1.2 Data Quality Assessment
Data quality assessment findings:
[A] No missing observations noted for any variable.
[B] No low variance noted for any variable with First.Second.Mode.Ratio>5.
[C] No low variance noted for any variable with Unique.Count.Ratio<0.01.
[D] No high skewness noted for any variable with Skewness>3 or Skewness<(-3).
##################################
# Loading dataset
##################################
DQA <- Sonar_Train
##################################
# Formulating an overall data quality assessment summary
##################################
(DQA.Summary <- data.frame(
Column.Index=c(1:length(names(DQA))),
Column.Name= names(DQA),
Column.Type=sapply(DQA, function(x) class(x)),
Row.Count=sapply(DQA, function(x) nrow(DQA)),
NA.Count=sapply(DQA,function(x)sum(is.na(x))),
Fill.Rate=sapply(DQA,function(x)format(round((sum(!is.na(x))/nrow(DQA)),3),nsmall=3)),
row.names=NULL)
)
## Column.Index Column.Name Column.Type Row.Count NA.Count Fill.Rate
## 1 1 Class factor 136 0 1.000
## 2 2 V1 numeric 136 0 1.000
## 3 3 V11 numeric 136 0 1.000
##################################
# Listing all predictors
##################################
DQA.Predictors <- DQA[,!names(DQA) %in% c("Class")]
##################################
# Listing all numeric predictors
##################################
DQA.Predictors.Numeric <- DQA.Predictors[,sapply(DQA.Predictors, is.numeric)]
if (length(names(DQA.Predictors.Numeric))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Numeric))),
" numeric predictor variable(s)."))
} else {
print("There are no numeric predictor variables.")
}
## [1] "There are 2 numeric predictor variable(s)."
##################################
# Listing all factor predictors
##################################
DQA.Predictors.Factor <- DQA.Predictors[,sapply(DQA.Predictors, is.factor)]
if (length(names(DQA.Predictors.Factor))>0) {
print(paste0("There are ",
(length(names(DQA.Predictors.Factor))),
" factor predictor variable(s)."))
} else {
print("There are no factor predictor variables.")
}
## [1] "There are no factor predictor variables."
##################################
# Formulating a data quality assessment summary for factor predictors
##################################
if (length(names(DQA.Predictors.Factor))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm <- ux[tab == max(tab)]
sm <- na.omit(x)[!(na.omit(x) %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
# Return a placeholder value when no second mode exists
ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
return("x"),
return(usm[tabsm == max(tabsm)]))
}
(DQA.Predictors.Factor.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Factor),
Column.Type=sapply(DQA.Predictors.Factor, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Factor, function(x) length(unique(x))),
First.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(FirstModes(x)[1])),
Second.Mode.Value=sapply(DQA.Predictors.Factor, function(x) as.character(SecondModes(x)[1])),
First.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Factor, function(x) sum(na.omit(x) == SecondModes(x)[1])),
Unique.Count.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Factor)),3), nsmall=3)),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Factor, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
row.names=NULL)
)
}
##################################
# Formulating a data quality assessment summary for numeric predictors
##################################
if (length(names(DQA.Predictors.Numeric))>0) {
##################################
# Formulating a function to determine the first mode
##################################
FirstModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
##################################
# Formulating a function to determine the second mode
##################################
SecondModes <- function(x) {
ux <- unique(na.omit(x))
tab <- tabulate(match(x, ux))
fm = ux[tab == max(tab)]
sm = na.omit(x)[!(na.omit(x) %in% fm)]
usm <- unique(sm)
tabsm <- tabulate(match(sm, usm))
# Return a small sentinel value when no second mode exists,
# keeping the downstream First.Second.Mode.Ratio computation numeric
ifelse(is.na(usm[tabsm == max(tabsm)])==TRUE,
return(0.00001),
return(usm[tabsm == max(tabsm)]))
}
(DQA.Predictors.Numeric.Summary <- data.frame(
Column.Name= names(DQA.Predictors.Numeric),
Column.Type=sapply(DQA.Predictors.Numeric, function(x) class(x)),
Unique.Count=sapply(DQA.Predictors.Numeric, function(x) length(unique(x))),
Unique.Count.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((length(unique(x))/nrow(DQA.Predictors.Numeric)),3), nsmall=3)),
First.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((FirstModes(x)[1]),3),nsmall=3)),
Second.Mode.Value=sapply(DQA.Predictors.Numeric, function(x) format(round((SecondModes(x)[1]),3),nsmall=3)),
First.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == FirstModes(x)[1])),
Second.Mode.Count=sapply(DQA.Predictors.Numeric, function(x) sum(na.omit(x) == SecondModes(x)[1])),
First.Second.Mode.Ratio=sapply(DQA.Predictors.Numeric, function(x) format(round((sum(na.omit(x) == FirstModes(x)[1])/sum(na.omit(x) == SecondModes(x)[1])),3), nsmall=3)),
Minimum=sapply(DQA.Predictors.Numeric, function(x) format(round(min(x,na.rm = TRUE),3), nsmall=3)),
Mean=sapply(DQA.Predictors.Numeric, function(x) format(round(mean(x,na.rm = TRUE),3), nsmall=3)),
Median=sapply(DQA.Predictors.Numeric, function(x) format(round(median(x,na.rm = TRUE),3), nsmall=3)),
Maximum=sapply(DQA.Predictors.Numeric, function(x) format(round(max(x,na.rm = TRUE),3), nsmall=3)),
Skewness=sapply(DQA.Predictors.Numeric, function(x) format(round(skewness(x,na.rm = TRUE),3), nsmall=3)),
Kurtosis=sapply(DQA.Predictors.Numeric, function(x) format(round(kurtosis(x,na.rm = TRUE),3), nsmall=3)),
Percentile25th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.25,na.rm = TRUE),3), nsmall=3)),
Percentile75th=sapply(DQA.Predictors.Numeric, function(x) format(round(quantile(x,probs=0.75,na.rm = TRUE),3), nsmall=3)),
row.names=NULL)
)
}
## Column.Name Column.Type Unique.Count Unique.Count.Ratio First.Mode.Value
## 1 V1 numeric 122 0.897 0.020
## 2 V11 numeric 134 0.985 0.213
## Second.Mode.Value First.Mode.Count Second.Mode.Count First.Second.Mode.Ratio
## 1 0.034 3 2 1.500
## 2 0.095 2 1 2.000
## Minimum Mean Median Maximum Skewness Kurtosis Percentile25th Percentile75th
## 1 0.002 0.032 0.024 0.137 1.915 6.988 0.015 0.039
## 2 0.052 0.264 0.250 0.734 0.909 4.151 0.178 0.322
##################################
# Identifying potential data quality issues
##################################
##################################
# Checking for missing observations
##################################
if ((nrow(DQA.Summary[DQA.Summary$NA.Count>0,]))>0){
print(paste0("Missing observations noted for ",
(nrow(DQA.Summary[DQA.Summary$NA.Count>0,])),
" variable(s) with NA.Count>0 and Fill.Rate<1.0."))
DQA.Summary[DQA.Summary$NA.Count>0,]
} else {
print("No missing observations noted.")
}
## [1] "No missing observations noted."
##################################
# Checking for zero or near-zero variance predictors
##################################
if (length(names(DQA.Predictors.Factor))==0) {
print("No factor predictors noted.")
} else if (nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,])),
" factor variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Factor.Summary[as.numeric(as.character(DQA.Predictors.Factor.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance factor predictors due to high first-second mode ratio noted.")
}
## [1] "No factor predictors noted."
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,])),
" numeric variable(s) with First.Second.Mode.Ratio>5."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$First.Second.Mode.Ratio))>5,]
} else {
print("No low variance numeric predictors due to high first-second mode ratio noted.")
}
## [1] "No low variance numeric predictors due to high first-second mode ratio noted."
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])>0){
print(paste0("Low variance observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,])),
" numeric variable(s) with Unique.Count.Ratio<0.01."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Unique.Count.Ratio))<0.01,]
} else {
print("No low variance numeric predictors due to low unique count ratio noted.")
}
## [1] "No low variance numeric predictors due to low unique count ratio noted."
##################################
# Checking for skewed predictors
##################################
if (length(names(DQA.Predictors.Numeric))==0) {
print("No numeric predictors noted.")
} else if (nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])>0){
print(paste0("High skewness observed for ",
(nrow(DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),])),
" numeric variable(s) with Skewness>3 or Skewness<(-3)."))
DQA.Predictors.Numeric.Summary[as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))>3 |
as.numeric(as.character(DQA.Predictors.Numeric.Summary$Skewness))<(-3),]
} else {
print("No skewed numeric predictors noted.")
}
## [1] "No skewed numeric predictors noted."
1.5 Oversampling and Undersampling Algorithms Applied for Class Imbalance using Logistic Regression
1.5.1 Original Data (LR)
Logistic Regression models the relationship between the probability of an event (among two outcome levels) and a set of predictors by expressing the log-odds of the event as a linear combination of the predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood: candidate values are refined over multiple iterations to maximize the log-likelihood function, which sums the logged conditional probabilities of the observed outcomes. Given the optimal parameters, the predicted probability for each observation is obtained by applying the inverse-logit transformation to its linear predictor.
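To make this mapping concrete, the minimal sketch below (illustration only) applies the inverse-logit transformation by hand, using the coefficient estimates reported in the model summary further below and hypothetical predictor values.
##################################
# Illustrating the logit-to-probability
# mapping used by logistic regression
##################################
b0 <- -2.3152; b1 <- -0.7399; b2 <- -1.5607   # estimates from the model fitted below
v1 <- 0.50; v11 <- -1.00                      # hypothetical predictor values
(logit <- b0 + b1*v1 + b2*v11)                # linear predictor (log-odds)
(prob <- 1/(1 + exp(-logit)))                 # inverse-logit: predicted P(Class=R)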
[A] The class ratio of the original data was noted at 80:20.
[A.1] Majority Class = Class=M with 111 instances
[A.2] Minority Class = Class=R with 25 instances
[B] The logistic regression model from the stats package was implemented. The Class response was regressed against the V1 and V11 predictors.
[C] The logistic curve, formed by plotting the predicted probabilities against the classification index (logit values), showed a skewed logistic profile with a longer tail for the predicted points belonging to the majority class.
##################################
# Creating a local object
# for the train set
##################################
PMA_PreModelling_Train_LR <- PMA_PreModelling_Train
PMA_PreModelling_Train_LR$Label <- rep("LR",nrow(PMA_PreModelling_Train_LR))
##################################
# Verifying the class distribution
# for the original data
##################################
table(PMA_PreModelling_Train_LR$Class)
##
## M R
## 111 25
##################################
# Visualizing the imbalanced data set
##################################
ggplot(PMA_PreModelling_Train, aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "Original Imbalanced Data Set") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

##################################
# Formulating the structure of the
# Logistic Regression model
##################################
LR_Model <- glm(Class ~ V1 + V11,
data = PMA_PreModelling_Train_LR,
family = binomial)
##################################
# Consolidating the model results
##################################
summary(LR_Model)
##
## Call:
## glm(formula = Class ~ V1 + V11, family = binomial, data = PMA_PreModelling_Train_LR)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.62998 -0.53562 -0.29014 -0.08872 2.55270
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3152 0.3771 -6.139 8.28e-10 ***
## V1 -0.7399 0.3005 -2.462 0.0138 *
## V11 -1.5607 0.3450 -4.524 6.07e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129.783 on 135 degrees of freedom
## Residual deviance: 88.923 on 133 degrees of freedom
## AIC: 94.923
##
## Number of Fisher Scoring iterations: 6
LR_Model_Coef <- (as.data.frame(LR_Model$coefficients))
LR_Model_Coef$Coef <- rownames(LR_Model_Coef)
LR_Model_Coef$Model <- rep("LR",nrow(LR_Model_Coef))
colnames(LR_Model_Coef) <- c("Estimates","Coefficients","Model")
print(LR_Model_Coef, rownames=FALSE)
## Estimates Coefficients Model
## (Intercept) -2.3152181 (Intercept) LR
## V1 -0.7399005 V1 LR
## V11 -1.5607048 V11 LR
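For interpretability, the estimated log-odds coefficients can be exponentiated into odds ratios; for example, a one-unit increase in V11 multiplies the estimated odds of Class=R by exp(-1.5607), or approximately 0.21.
##################################
# Expressing the estimated log-odds
# coefficients as odds ratios
##################################
exp(coef(LR_Model))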
##################################
# Computing the model predictions
##################################
(LR_Model_Probabilities <- predict(LR_Model,
type = c("response")))
## 98 99 100 101 102 103
## 0.3637496537 0.0113931997 0.1319142547 0.0021783355 0.0037538060 0.0174606489
## 104 105 106 107 108 109
## 0.0725273351 0.0164891503 0.1569862922 0.4818797362 0.2583425943 0.1252724633
## 110 111 112 113 114 115
## 0.4112142881 0.4680556213 0.1332256210 0.0344698396 0.1348432397 0.1615191484
## 116 117 118 119 120 121
## 0.0662797277 0.2436188082 0.0780323151 0.1104267965 0.0515700587 0.1445535907
## 122 123 124 125 126 127
## 0.1731460430 0.0558634929 0.1081612509 0.0165343094 0.0369004305 0.0031146384
## 128 129 130 131 132 133
## 0.0221776030 0.0200623323 0.0028456905 0.0126580025 0.0050535349 0.0081982533
## 134 135 136 137 138 139
## 0.0025368869 0.0014777580 0.0799936459 0.0023418015 0.0008311849 0.0846443282
## 140 141 142 143 144 145
## 0.2606104200 0.1780753538 0.0394327767 0.0403610789 0.0364744270 0.7351055523
## 146 147 148 149 150 151
## 0.0306057253 0.0228736619 0.0552988212 0.0205374459 0.2190542623 0.2301893864
## 152 153 154 155 156 157
## 0.1183496315 0.1771452296 0.3898752302 0.5789364640 0.6505131954 0.1549759240
## 158 159 160 161 162 163
## 0.1418070654 0.0908007706 0.0533759775 0.0797056569 0.0996424369 0.1113892912
## 164 165 166 167 168 169
## 0.2325914484 0.0352838163 0.0868721918 0.2269151196 0.5811364344 0.6804246275
## 170 171 172 173 174 175
## 0.2377989316 0.1546431081 0.0039867863 0.0484288031 0.0350025026 0.0115543490
## 176 177 178 179 180 181
## 0.0124263395 0.0040952633 0.0623582127 0.2442298264 0.0289787190 0.0051456066
## 182 183 184 185 186 187
## 0.0192102410 0.0354944115 0.0420803608 0.0020214950 0.0046310125 0.0336330699
## 188 189 190 191 192 193
## 0.0359277781 0.1172950153 0.1419944753 0.1265297826 0.1047718509 0.2755585363
## 194 195 196 197 198 199
## 0.1390569085 0.0450510742 0.1550648867 0.2872791732 0.0821006438 0.0726099031
## 200 201 202 203 204 205
## 0.0792285656 0.0532267024 0.0299770042 0.0173915161 0.0553203581 0.0345232209
## 206 207 208 95 57 27
## 0.0333086574 0.0446830571 0.0622826685 0.8967905106 0.2407267465 0.6874636823
## 18 68 92 43 87 64
## 0.4352384578 0.4517614566 0.0687256353 0.6033006196 0.1583196380 0.7793277803
## 16 12 61 13 34 66
## 0.0727076286 0.8563207774 0.3983590501 0.7017484676 0.3096802387 0.8196244700
## 49 94 91 72 23 39
## 0.3431752805 0.0384599213 0.0861723774 0.8744789160 0.4935529921 0.5807289286
## 29 73 77 32
## 0.0739144263 0.5017946378 0.5358622581 0.3382071510
##################################
# Creating a classification index
# based from the model predictions
##################################
(LR_Model_Indices <- predict(LR_Model,
type = c("link")))
## 98 99 100 101 102 103
## -0.559126246 -4.463280018 -1.884138369 -6.127013509 -5.581224150 -4.030190682
## 104 105 106 107 108 109
## -2.548499795 -4.088426063 -1.680824728 -0.072512812 -1.054600830 -1.943421380
## 110 111 112 113 114 115
## -0.358947840 -0.127951795 -1.872734622 -3.332592613 -1.858797800 -1.646968043
## 116 117 118 119 120 121
## -2.645292815 -1.132940729 -2.469387136 -2.086388975 -2.911866678 -1.777973140
## 122 123 124 125 126 127
## -1.563492668 -2.827359673 -2.109662164 -4.085645174 -3.261933584 -5.768522713
## 128 129 130 131 132 133
## -3.786245149 -3.888644925 -5.859099785 -4.356726856 -5.282600962 -4.795602114
## 134 135 136 137 138 139
## -5.974277480 -6.515750374 -2.442433371 -6.054490239 -7.091826715 -2.380854599
## 140 141 142 143 144 145
## -1.042798305 -1.529441923 -3.192926600 -3.168691163 -3.273987644 1.020682662
## 146 147 148 149 150 151
## -3.455484324 -3.754629841 -2.838117074 -3.864754157 -1.271186197 -1.207242139
## 152 153 154 155 156 157
## -2.008152348 -1.535809819 -0.447836712 0.318408998 0.621295777 -1.696095344
## 158 159 160 161 162 163
## -1.800361500 -2.303896473 -2.875541209 -2.446352996 -2.201203830 -2.076628048
## 164 165 166 167 168 169
## -1.193735847 -3.308409550 -2.352437879 -1.225812824 0.327440308 0.755723902
## 170 171 172 173 174 175
## -1.164784899 -1.698638961 -5.520775067 -2.978019769 -3.316705945 -4.449071761
## 176 177 178 179 180 181
## -4.375432715 -5.493820583 -2.710472604 -1.129627633 -3.511786651 -5.264453127
## 182 183 184 185 186 187
## -3.932914599 -3.302240366 -3.125182749 -6.201894377 -5.370337994 -3.358033801
## 188 189 190 191 192 193
## -3.289655451 -2.018298780 -1.798822388 -1.931996317 -2.145293464 -0.966600884
## 194 195 196 197 198 199
## -1.823145144 -3.053861030 -1.695416182 -0.908635325 -2.414141893 -2.547272974
## 200 201 202 203 204 205
## -2.452874924 -2.878499479 -3.476889218 -4.034228249 -2.837704887 -3.330989885
## 206 207 208 95 57 27
## -3.368061909 -3.062448770 -2.711765358 2.162061490 -1.148699272 0.788288323
## 18 68 92 43 87 64
## -0.260509533 -0.193556201 -2.606431655 0.419236847 -1.670784310 1.261753296
## 16 12 61 13 34 66
## -2.545822600 1.785061852 -0.412307104 0.855637833 -0.801614636 1.513805316
## 49 94 91 72 23 39
## -0.649175855 -3.218919553 -2.361292279 1.941154440 -0.025789461 0.325766429
## 29 73 77 32
## -2.528058620 0.007178582 0.143695781 -0.671294010
max(LR_Model_Indices)
## [1] 2.162061
min(LR_Model_Indices)
## [1] -7.091827
##################################
# Consolidating the model probabilities
# and classification index
# based from the model predictions
##################################
LR_Model_Predictions <- as.data.frame(PMA_PreModelling_Train_LR)
LR_Model_Predictions$LR_Prob <- LR_Model_Probabilities
LR_Model_Predictions$LR_LP <- LR_Model_Indices
LR_Model_Predictions$Class <- as.factor(LR_Model_Predictions$Class)
LR_Model_Predictions$Label <- rep("LR",nrow(LR_Model_Predictions))
##################################
# Formulating the probability curve
# using the consolidated model predictions
##################################
LR_Model_Predictions %>%
ggplot(aes(x = LR_LP ,
y = LR_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
ggtitle("Estimated Rock Detection Probabilities Based on Classification Index : Logistic Regression") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

1.5.2 Undersampling - Random Downsampling (LR_US_DOWNSAMPLE)
Random Downsampling randomly removes rows from the majority class so that its frequency matches that of the minority class, as sketched below.
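The minimal base-R sketch below (illustration only, assuming the PMA_PreModelling_Train data set shown above) conveys the same idea; the actual resampling in this section is performed by the step_downsample() recipe step.
##################################
# A minimal base-R sketch of
# random downsampling
##################################
set.seed(123456789)
majority_rows <- which(PMA_PreModelling_Train$Class == "M")
minority_rows <- which(PMA_PreModelling_Train$Class == "R")
keep_rows <- c(sample(majority_rows, length(minority_rows)), minority_rows)
table(PMA_PreModelling_Train[keep_rows, "Class"])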
Logistic Regression models the relationship between the probability of an event (among two outcome levels) and a set of predictors by expressing the log-odds of the event as a linear combination of the predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood: candidate values are refined over multiple iterations to maximize the log-likelihood function, which sums the logged conditional probabilities of the observed outcomes. Given the optimal parameters, the predicted probability for each observation is obtained by applying the inverse-logit transformation to its linear predictor.
[A] The class ratio of the original data was noted at 80:20.
[A.1] Majority Class = Class=M with 111 instances
[A.2] Minority Class = Class=R with 25 instances
[B] The class ratio of the undersampled data was noted at 50:50, although the total number of instances was decreased by 63%.
[B.1] Majority Class = Class=M with 25 instances
[B.2] Minority Class = Class=R with 25 instances
[C] The logistic regression model from the stats package was implemented. The Class response was regressed against the V1 and V11 predictors.
[D] The logistic curve, formed by plotting the predicted probabilities against the classification index (logit values), showed a sufficiently balanced logistic profile for the predicted points from both the majority and minority classes, although the distribution of instances was relatively sparse.
##################################
# Visualizing the imbalanced data set
##################################
ggplot(PMA_PreModelling_Train, aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "Without Undersampling - Random Downsampling") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

##################################
# Implementing US_DOWNSAMPLE
# Visualizing the undersampled data using US_DOWNSAMPLE
##################################
recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_downsample(Class, seed=123456789) %>%
prep() %>%
bake(new_data = NULL) %>%
ggplot(aes(V1, V11, color = Class)) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
labs(title = "With Undersampling - Random Downsample") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

US_DOWNSAMPLE <- recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_downsample(Class, seed=123456789) %>%
prep()
PMA_PreModelling_Train_LR_US_DOWNSAMPLE <- US_DOWNSAMPLE %>%
bake(new_data = NULL)
(PMA_PreModelling_Train_LR_US_DOWNSAMPLE <- as.data.frame(PMA_PreModelling_Train_LR_US_DOWNSAMPLE))
## V1 V11 Class
## 1 -0.295964244 1.50754826 M
## 2 -0.371994173 0.60103638 M
## 3 -0.789109306 -0.36302243 M
## 4 -0.256318859 -0.63813480 M
## 5 -0.779287302 0.73036007 M
## 6 0.537644499 -0.19150300 M
## 7 0.043933590 0.30731955 M
## 8 0.424204681 0.54321732 M
## 9 1.395977603 1.55085071 M
## 10 1.852613244 0.04399440 M
## 11 -0.507041623 0.38985083 M
## 12 -0.555415865 0.01776805 M
## 13 -2.093160440 0.59563328 M
## 14 0.207991857 -0.39104999 M
## 15 0.278484686 -2.26945660 M
## 16 -0.068999246 0.63930609 M
## 17 -0.935159101 0.03686239 M
## 18 0.424204681 1.89154083 M
## 19 0.625610136 0.17668535 M
## 20 0.256869245 1.19827720 M
## 21 0.695599819 -0.11827986 M
## 22 -0.499153812 0.87301303 M
## 23 0.345301912 -0.27257795 M
## 24 1.231717734 1.85842121 M
## 25 -0.957451233 0.02573746 M
## 26 -2.902302684 -1.49282840 R
## 27 -0.588710342 -0.46833442 R
## 28 -0.597170869 -1.70542147 R
## 29 -0.289270938 -1.17938825 R
## 30 0.544629613 -1.61762409 R
## 31 0.064360561 0.15607894 R
## 32 -0.168320027 -1.67226688 R
## 33 -0.316256467 -0.26298086 R
## 34 -1.638716948 -1.51501038 R
## 35 0.274190661 0.01776805 R
## 36 -0.860055136 -2.21946189 R
## 37 -0.789109306 -0.84516218 R
## 38 -1.427539411 -1.35491277 R
## 39 0.779483548 -1.33935885 R
## 40 -0.400799099 -2.26338258 R
## 41 -0.302692686 -0.92399268 R
## 42 0.827857790 0.18656255 R
## 43 -0.829167731 0.42261407 R
## 44 -2.434915803 -1.57286451 R
## 45 -1.138280881 -0.92728240 R
## 46 -0.860055136 -1.28443852 R
## 47 -1.125398710 0.66990478 R
## 48 -0.186674974 -1.39954451 R
## 49 -0.008605128 -1.57143555 R
## 50 -1.348878827 -0.41384379 R
PMA_PreModelling_Train_LR_US_DOWNSAMPLE$Label <- rep("LR_US_DOWNSAMPLE",nrow(PMA_PreModelling_Train_LR_US_DOWNSAMPLE))
##################################
# Verifying the class distribution
# for the undersampled data using US_DOWNSAMPLE
##################################
table(PMA_PreModelling_Train_LR_US_DOWNSAMPLE$Class)
##
## M R
## 25 25
##################################
# Formulating the structure of the
# Logistic Regression model
##################################
LR_US_DOWNSAMPLE_Model <- glm(Class ~ V1 + V11,
data = PMA_PreModelling_Train_LR_US_DOWNSAMPLE,
family = binomial)
##################################
# Consolidating the model results
##################################
summary(LR_US_DOWNSAMPLE_Model)
##
## Call:
## glm(formula = Class ~ V1 + V11, family = binomial, data = PMA_PreModelling_Train_LR_US_DOWNSAMPLE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.44182 -0.67861 0.03524 0.59883 1.93261
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7625 0.4369 -1.745 0.080920 .
## V1 -0.7448 0.5075 -1.468 0.142179
## V11 -1.7181 0.5063 -3.393 0.000691 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 69.315 on 49 degrees of freedom
## Residual deviance: 43.344 on 47 degrees of freedom
## AIC: 49.344
##
## Number of Fisher Scoring iterations: 5
LR_US_DOWNSAMPLE_Model_Coef <- (as.data.frame(LR_US_DOWNSAMPLE_Model$coefficients))
LR_US_DOWNSAMPLE_Model_Coef$Coef <- rownames(LR_US_DOWNSAMPLE_Model_Coef)
LR_US_DOWNSAMPLE_Model_Coef$Model <- rep("LR_US_DOWNSAMPLE",nrow(LR_US_DOWNSAMPLE_Model_Coef))
colnames(LR_US_DOWNSAMPLE_Model_Coef) <- c("Estimates","Coefficients","Model")
print(LR_US_DOWNSAMPLE_Model_Coef, rownames=FALSE)
## Estimates Coefficients Model
## (Intercept) -0.7625171 (Intercept) LR_US_DOWNSAMPLE
## V1 -0.7448227 V1 LR_US_DOWNSAMPLE
## V11 -1.7180819 V11 LR_US_DOWNSAMPLE
##################################
# Computing the model predictions
##################################
(LR_US_DOWNSAMPLE_Model_Probabilities <- predict(LR_US_DOWNSAMPLE_Model,
type = c("response")))
## 1 2 3 4 5 6
## 0.041799701 0.179744842 0.610385203 0.628260800 0.192025135 0.302810692
## 7 8 9 10 11 12
## 0.210279740 0.117973061 0.011354052 0.098148202 0.258331895 0.406281809
## 13 14 15 16 17 18
## 0.443534294 0.438912131 0.949269888 0.140697941 0.467714486 0.013018286
## 19 20 21 22 23 24
## 0.177694571 0.046860883 0.253998128 0.131171064 0.365540908 0.007593438
## 25 26 27 28 29 30
## 0.476615880 0.981366508 0.617890633 0.931650474 0.814455352 0.833555873
## 31 32 33 34 35 36
## 0.253769009 0.903431085 0.481223964 0.955252856 0.269478992 0.975667022
## 37 38 39 40 41 42
## 0.781987655 0.932679562 0.722728010 0.968467123 0.740857535 0.154510435
## 43 44 45 46 47 48
## 0.295042156 0.977098243 0.842698840 0.889421156 0.254406742 0.855827057
## 49 50
## 0.874763232 0.721753452
##################################
# Creating a classification index
# based from the model predictions
##################################
(LR_US_DOWNSAMPLE_Model_Indices <- predict(LR_US_DOWNSAMPLE_Model,
type = c("link")))
## 1 2 3 4 5 6
## -3.13216766 -1.51807716 0.44893169 0.52476284 -1.43690467 -0.83394915
## 7 8 9 10 11 12
## -1.32324004 -2.01176629 -4.46676157 -2.21797160 -1.05465667 -0.37935775
## 13 14 15 16 17 18
## -0.22683042 -0.24557829 2.92917349 -1.80950517 -0.12932199 -4.32829655
## 19 20 21 22 23 24
## -1.53204571 -3.01257759 -1.07740121 -1.89064395 -0.55139462 -4.87284841
## 25 26 27 28 29 30
## -0.09360477 3.96398538 0.48060460 2.61232309 1.47922407 1.61104105
## 31 32 33 34 35 36
## -1.07861075 2.23594294 -0.07513948 3.06094845 -0.99726756 3.69128882
## 37 38 39 40 41 42
## 1.27728727 2.62859781 0.95803402 3.42468383 1.05043038 -1.69965419
## 43 44 45 46 47 48
## -0.87101976 3.75337359 1.67844746 2.08484208 -1.07524589 1.78105475
## 49 50
## 1.94374717 0.95317602
max(LR_US_DOWNSAMPLE_Model_Indices)
## [1] 3.963985
min(LR_US_DOWNSAMPLE_Model_Indices)
## [1] -4.872848
##################################
# Consolidating the model probabilities
# and classification index
# based from the model predictions
##################################
LR_US_DOWNSAMPLE_Model_Predictions <- as.data.frame(PMA_PreModelling_Train_LR_US_DOWNSAMPLE)
LR_US_DOWNSAMPLE_Model_Predictions$LR_US_DOWNSAMPLE_Prob <- LR_US_DOWNSAMPLE_Model_Probabilities
LR_US_DOWNSAMPLE_Model_Predictions$LR_US_DOWNSAMPLE_LP <- LR_US_DOWNSAMPLE_Model_Indices
LR_US_DOWNSAMPLE_Model_Predictions$Class <- as.factor(LR_US_DOWNSAMPLE_Model_Predictions$Class)
LR_US_DOWNSAMPLE_Model_Predictions$Label <- rep("LR_US_DOWNSAMPLE",nrow(LR_US_DOWNSAMPLE_Model_Predictions))
##################################
# Formulating the probability curve
# using the consolidated model predictions
##################################
LR_US_DOWNSAMPLE_Model_Predictions %>%
ggplot(aes(x = LR_US_DOWNSAMPLE_LP ,
y = LR_US_DOWNSAMPLE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
ggtitle("Estimated Rock Detection Probabilities Based on Classification Index : Logistic Regression (US_DOWNSAMPLE)") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

1.5.3 Oversampling - Random Upsampling (LR_OS_UPSAMPLE)
Random Upsampling randomly replicates rows from the minority class so that its frequency matches that of the majority class, as sketched below.
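The minimal base-R sketch below (illustration only, assuming the PMA_PreModelling_Train data set shown above) conveys the same idea; the actual resampling in this section is performed by the step_upsample() recipe step.
##################################
# A minimal base-R sketch of
# random upsampling
##################################
set.seed(123456789)
majority_rows <- which(PMA_PreModelling_Train$Class == "M")
minority_rows <- which(PMA_PreModelling_Train$Class == "R")
keep_rows <- c(majority_rows,
               sample(minority_rows, length(majority_rows), replace = TRUE))
table(PMA_PreModelling_Train[keep_rows, "Class"])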
Logistic Regression models the relationship between the probability of an event (among two outcome levels) and a set of predictors by expressing the log-odds of the event as a linear combination of the predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood: candidate values are refined over multiple iterations to maximize the log-likelihood function, which sums the logged conditional probabilities of the observed outcomes. Given the optimal parameters, the predicted probability for each observation is obtained by applying the inverse-logit transformation to its linear predictor.
[A] The class ratio of the original data was noted at 80:20.
[A.1] Majority Class = Class=M with 111 instances
[A.2] Minority Class = Class=R with 25 instances
[B] The class ratio of the oversampled data was noted at 50:50, although the majority of the added instances were not unique values but replicates of existing minority-class rows.
[B.1] Majority Class = Class=M with 111 instances
[B.2] Minority Class = Class=R with 111 instances
[C] The logistic regression model from the stats package was implemented. The Class response was regressed against the V1 and V11 predictors.
[D] The logistic curve, formed by plotting the predicted probabilities against the classification index (logit values), showed a sufficiently balanced logistic profile for the predicted points from both the majority and minority classes, although the ratio of unique values to the number of instances was relatively low.
##################################
# Visualizing the imbalanced data set
##################################
ggplot(PMA_PreModelling_Train, aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "Without Oversampling - Random Upsampling") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

##################################
# Implementing OS_UPSAMPLE
# Visualizing the oversampled data using OS_UPSAMPLE
##################################
recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_upsample(Class, seed=123456789) %>%
prep() %>%
bake(new_data = NULL) %>%
ggplot(aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "With Undersampling - Random Upsample") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

OS_UPSAMPLE <- recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_upsample(Class, seed=123456789) %>%
prep()
PMA_PreModelling_Train_LR_OS_UPSAMPLE <- OS_UPSAMPLE %>%
bake(new_data = NULL)
(PMA_PreModelling_Train_LR_OS_UPSAMPLE <- as.data.frame(PMA_PreModelling_Train_LR_OS_UPSAMPLE))
## V1 V11 Class
## 1 0.914240973 -1.55861587 M
## 2 2.175018299 0.34520609 M
## 3 -0.230553865 -0.16690717 M
## 4 1.231717734 1.85842121 M
## 5 0.424204681 1.89154083 M
## 6 1.143139503 0.55690423 M
## 7 -0.507041623 0.38985083 M
## 8 0.312328639 0.98808941 M
## 9 -0.935159101 0.03686239 M
## 10 0.408807916 -1.63079048 M
## 11 0.738227771 -1.15770286 M
## 12 1.169078303 -0.79246144 M
## 13 0.118911997 -1.30982705 M
## 14 -0.174409184 -1.31877657 M
## 15 1.012209983 -0.76338471 M
## 16 0.813818599 0.26605270 M
## 17 0.207991857 -0.39104999 M
## 18 -0.957451233 0.02573746 M
## 19 0.695599819 -0.11827986 M
## 20 -1.204708453 -0.18639848 M
## 21 -0.068999246 0.13149291 M
## 22 0.527094931 -0.39650480 M
## 23 0.104263091 0.33286517 M
## 24 0.465616266 -0.56497212 M
## 25 -0.507041623 -0.24127887 M
## 26 0.043933590 0.30731955 M
## 27 0.147716923 -0.20173692 M
## 28 0.612463710 0.84401925 M
## 29 -0.068999246 0.63930609 M
## 30 1.395977603 1.55085071 M
## 31 -0.180527407 1.02812486 M
## 32 0.565359423 0.74012531 M
## 33 2.230423635 1.21329169 M
## 34 0.782380194 0.93715691 M
## 35 2.005116912 0.95072168 M
## 36 1.784287809 0.74337479 M
## 37 1.523834205 1.62207084 M
## 38 1.928176420 1.77732114 M
## 39 -1.204708453 0.65264083 M
## 40 1.934080462 1.47897609 M
## 41 0.744203377 2.70773324 M
## 42 1.424344244 -0.63319889 M
## 43 -0.491314245 -0.58236264 M
## 44 0.689392711 -0.83030322 M
## 45 1.381555338 -0.09258957 M
## 46 1.002499594 0.07158504 M
## 47 0.977896818 0.15071601 M
## 48 0.278484686 -2.26945660 M
## 49 1.406688795 0.06372536 M
## 50 1.852613244 0.04399440 M
## 51 1.281675939 -0.27257795 M
## 52 1.390588252 0.33359231 M
## 53 -0.192852168 -0.57752146 M
## 54 -0.180527407 -0.62433567 M
## 55 -0.052243905 -0.17198030 M
## 56 -0.779287302 -0.12994977 M
## 57 -0.041194136 -1.17696945 M
## 58 -0.924156756 -1.24933498 M
## 59 -0.168320027 -1.80173334 M
## 60 -2.093160440 0.59563328 M
## 61 -0.230553865 -0.22058604 M
## 62 -1.038676203 0.48516249 M
## 63 -0.030238811 0.37335493 M
## 64 0.089444831 0.04161877 M
## 65 0.303951059 -0.21715050 M
## 66 -0.132380328 -0.09011428 M
## 67 -1.546463816 0.01457488 M
## 68 -0.499153812 0.87301303 M
## 69 -0.108968439 0.07550792 M
## 70 0.686277848 -1.02337267 M
## 71 -0.721885235 -1.35101476 M
## 72 -3.557061230 -0.28132844 M
## 73 -0.789109306 -0.36302243 M
## 74 -0.750264961 -0.03937819 M
## 75 -0.379134945 2.23365699 M
## 76 -0.371994173 0.60103638 M
## 77 0.401039618 0.45156422 M
## 78 -0.295964244 1.50754826 M
## 79 0.256869245 1.19827720 M
## 80 1.243886483 1.44694263 M
## 81 -0.230553865 0.36255507 M
## 82 -0.256318859 -0.63813480 M
## 83 0.632133128 0.46700244 M
## 84 0.324793230 1.73570326 M
## 85 0.723165726 0.69367752 M
## 86 -1.191144673 1.19711994 M
## 87 -1.177722925 1.07730973 M
## 88 0.142960831 2.42255907 M
## 89 0.443194144 1.74741590 M
## 90 -0.180527407 0.75375436 M
## 91 0.544629613 0.36615870 M
## 92 -1.274767709 0.41409622 M
## 93 -0.539087424 -0.07530230 M
## 94 -0.555415865 0.01776805 M
## 95 0.345301912 -0.27257795 M
## 96 -1.868589923 0.02175517 M
## 97 -0.217863016 -0.21200423 M
## 98 0.625610136 0.17668535 M
## 99 -0.799007156 -0.01833539 M
## 100 -2.013850697 0.05347991 M
## 101 0.537644499 -0.19150300 M
## 102 -0.013979414 0.15531332 M
## 103 -0.935159101 0.53154288 M
## 104 -0.779287302 0.73036007 M
## 105 0.424204681 0.54321732 M
## 106 0.157176488 1.02691760 M
## 107 -0.323092564 0.48794823 M
## 108 0.377448180 0.47190071 M
## 109 0.992715078 0.20396774 M
## 110 0.295518362 0.33867807 M
## 111 0.099342681 0.20698569 M
## 112 -2.434915803 -1.57286451 R
## 113 -0.829167731 0.42261407 R
## 114 -0.829167731 0.42261407 R
## 115 -2.434915803 -1.57286451 R
## 116 -1.348878827 -0.41384379 R
## 117 -0.008605128 -1.57143555 R
## 118 -0.168320027 -1.67226688 R
## 119 -1.348878827 -0.41384379 R
## 120 -0.316256467 -0.26298086 R
## 121 -1.427539411 -1.35491277 R
## 122 -0.597170869 -1.70542147 R
## 123 -0.186674974 -1.39954451 R
## 124 -0.400799099 -2.26338258 R
## 125 0.544629613 -1.61762409 R
## 126 -0.789109306 -0.84516218 R
## 127 -0.316256467 -0.26298086 R
## 128 -0.597170869 -1.70542147 R
## 129 -0.588710342 -0.46833442 R
## 130 -0.186674974 -1.39954451 R
## 131 -2.434915803 -1.57286451 R
## 132 0.544629613 -1.61762409 R
## 133 0.827857790 0.18656255 R
## 134 -0.186674974 -1.39954451 R
## 135 -0.588710342 -0.46833442 R
## 136 0.544629613 -1.61762409 R
## 137 -0.186674974 -1.39954451 R
## 138 0.064360561 0.15607894 R
## 139 -0.789109306 -0.84516218 R
## 140 -0.302692686 -0.92399268 R
## 141 -0.860055136 -1.28443852 R
## 142 -0.289270938 -1.17938825 R
## 143 -0.302692686 -0.92399268 R
## 144 0.544629613 -1.61762409 R
## 145 -0.860055136 -1.28443852 R
## 146 -0.186674974 -1.39954451 R
## 147 0.064360561 0.15607894 R
## 148 -0.008605128 -1.57143555 R
## 149 -1.427539411 -1.35491277 R
## 150 -0.588710342 -0.46833442 R
## 151 0.779483548 -1.33935885 R
## 152 -0.860055136 -1.28443852 R
## 153 0.827857790 0.18656255 R
## 154 -0.186674974 -1.39954451 R
## 155 -0.186674974 -1.39954451 R
## 156 -0.588710342 -0.46833442 R
## 157 -2.434915803 -1.57286451 R
## 158 0.779483548 -1.33935885 R
## 159 -0.168320027 -1.67226688 R
## 160 -0.860055136 -2.21946189 R
## 161 -0.588710342 -0.46833442 R
## 162 -2.434915803 -1.57286451 R
## 163 -1.138280881 -0.92728240 R
## 164 -0.008605128 -1.57143555 R
## 165 -0.316256467 -0.26298086 R
## 166 -0.860055136 -2.21946189 R
## 167 0.544629613 -1.61762409 R
## 168 -0.289270938 -1.17938825 R
## 169 -1.638716948 -1.51501038 R
## 170 -1.638716948 -1.51501038 R
## 171 -0.829167731 0.42261407 R
## 172 -0.302692686 -0.92399268 R
## 173 -2.902302684 -1.49282840 R
## 174 -0.588710342 -0.46833442 R
## 175 -0.588710342 -0.46833442 R
## 176 0.064360561 0.15607894 R
## 177 -0.316256467 -0.26298086 R
## 178 -0.186674974 -1.39954451 R
## 179 -1.348878827 -0.41384379 R
## 180 -2.434915803 -1.57286451 R
## 181 -1.138280881 -0.92728240 R
## 182 -0.302692686 -0.92399268 R
## 183 -0.400799099 -2.26338258 R
## 184 -1.638716948 -1.51501038 R
## 185 -1.125398710 0.66990478 R
## 186 -0.860055136 -1.28443852 R
## 187 -0.789109306 -0.84516218 R
## 188 -0.588710342 -0.46833442 R
## 189 -1.138280881 -0.92728240 R
## 190 -0.302692686 -0.92399268 R
## 191 0.779483548 -1.33935885 R
## 192 0.779483548 -1.33935885 R
## 193 -0.860055136 -1.28443852 R
## 194 -1.638716948 -1.51501038 R
## 195 -0.597170869 -1.70542147 R
## 196 0.827857790 0.18656255 R
## 197 -0.302692686 -0.92399268 R
## 198 -1.138280881 -0.92728240 R
## 199 -0.400799099 -2.26338258 R
## 200 -1.348878827 -0.41384379 R
## 201 -0.302692686 -0.92399268 R
## 202 -0.860055136 -2.21946189 R
## 203 -1.348878827 -0.41384379 R
## 204 -0.789109306 -0.84516218 R
## 205 -0.829167731 0.42261407 R
## 206 -0.168320027 -1.67226688 R
## 207 -0.860055136 -2.21946189 R
## 208 -0.400799099 -2.26338258 R
## 209 -1.427539411 -1.35491277 R
## 210 -0.860055136 -2.21946189 R
## 211 -0.597170869 -1.70542147 R
## 212 -0.400799099 -2.26338258 R
## 213 -0.860055136 -1.28443852 R
## 214 -0.302692686 -0.92399268 R
## 215 -1.427539411 -1.35491277 R
## 216 -1.348878827 -0.41384379 R
## 217 0.064360561 0.15607894 R
## 218 -0.168320027 -1.67226688 R
## 219 -0.168320027 -1.67226688 R
## 220 -0.860055136 -1.28443852 R
## 221 -1.638716948 -1.51501038 R
## 222 -0.588710342 -0.46833442 R
PMA_PreModelling_Train_LR_OS_UPSAMPLE$Label <- rep("LR_OS_UPSAMPLE",nrow(PMA_PreModelling_Train_LR_OS_UPSAMPLE))
##################################
# Verifying the class distribution
# for the oversampled data using OS_UPSAMPLE
##################################
table(PMA_PreModelling_Train_LR_OS_UPSAMPLE$Class)
##
## M R
## 111 111
##################################
# Formulating the structure of the
# Logistic Regression model
##################################
LR_OS_UPSAMPLE_Model <- glm(Class ~ V1 + V11,
data = PMA_PreModelling_Train_LR_OS_UPSAMPLE,
family = binomial)
##################################
# Consolidating the model results
##################################
summary(LR_OS_UPSAMPLE_Model)
##
## Call:
## glm(formula = Class ~ V1 + V11, family = binomial, data = PMA_PreModelling_Train_LR_OS_UPSAMPLE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.41124 -0.60898 0.06056 0.60185 2.17631
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1647 0.2470 -4.715 2.42e-06 ***
## V1 -0.9110 0.2482 -3.670 0.000243 ***
## V11 -1.8812 0.2634 -7.142 9.17e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 307.76 on 221 degrees of freedom
## Residual deviance: 181.78 on 219 degrees of freedom
## AIC: 187.78
##
## Number of Fisher Scoring iterations: 5
LR_OS_UPSAMPLE_Model_Coef <- (as.data.frame(LR_OS_UPSAMPLE_Model$coefficients))
LR_OS_UPSAMPLE_Model_Coef$Coef <- rownames(LR_OS_UPSAMPLE_Model_Coef)
LR_OS_UPSAMPLE_Model_Coef$Model <- rep("LR_OS_UPSAMPLE",nrow(LR_OS_UPSAMPLE_Model_Coef))
colnames(LR_OS_UPSAMPLE_Model_Coef) <- c("Estimates","Coefficients","Model")
print(LR_OS_UPSAMPLE_Model_Coef, rownames=FALSE)
## Estimates Coefficients Model
## (Intercept) -1.1646747 (Intercept) LR_OS_UPSAMPLE
## V1 -0.9110338 V1 LR_OS_UPSAMPLE
## V11 -1.8811681 V11 LR_OS_UPSAMPLE
##################################
# Computing the model predictions
##################################
(LR_OS_UPSAMPLE_Model_Probabilities <- predict(LR_OS_UPSAMPLE_Model,
type = c("response")))
## 1 2 3 4 5 6
## 0.7179750734 0.0219760571 0.3450992033 0.0030705786 0.0060031731 0.0371925877
## 7 8 9 10 11 12
## 0.1921472883 0.0352984771 0.4056307613 0.8221055779 0.5843383511 0.3232247505
## 13 14 15 16 17 18
## 0.7669193985 0.8138250288 0.3428144171 0.0826722516 0.3501155120 0.4156101313
## 19 20 21 22 23 24
## 0.1713790099 0.5704046636 0.2060053981 0.2892593757 0.1317198352 0.3714364633
## 25 26 27 28 29 30
## 0.4381049807 0.1439553447 0.2850090965 0.0352163999 0.0907545671 0.0047075528
## 31 32 33 34 35 36
## 0.0504845805 0.0442755776 0.0041558586 0.0255708393 0.0083271556 0.0149403855
## 37 38 39 40 41 42
## 0.0036683904 0.0018985803 0.2150293746 0.0033053895 0.0009708468 0.2190609887
## 43 44 45 46 47 48
## 0.5934995135 0.4425560636 0.0954235210 0.0986203238 0.0879365600 0.9453623871
## 49 50 51 52 53 54
## 0.0713526975 0.0504392980 0.1394866952 0.0448266150 0.5243395342 0.5434581342
## 55 56 57 58 59 60
## 0.3114058838 0.4476277500 0.7478035602 0.8836514796 0.9151368708 0.4065560302
## 61 62 63 64 65 66
## 0.3682636851 0.2439622644 0.1371140479 0.2100782444 0.2624859141 0.2943080924
## 67 68 69 70 71 72
## 0.5539858259 0.0868900350 0.2301536474 0.5337583288 0.8843692868 0.9311925798
## 73 74 75 76 77 78
## 0.5590079462 0.3996122680 0.0065536738 0.1238543705 0.0847490662 0.0234079574
## 79 80 81 82 83 84
## 0.0252623769 0.0065622381 0.1629201398 0.5669159737 0.0679205488 0.0087864263
## 85 86 87 88 89 90
## 0.0419497741 0.0885487004 0.1073279679 0.0028654573 0.0077243548 0.0817993402
## 91 92 93 94 95 96
## 0.0870930602 0.3138237976 0.3700758345 0.3335668084 0.2755803841 0.6216949898
## 97 98 99 100 101 102
## 0.3618419434 0.1123457119 0.4007694305 0.6386262692 0.2151364084 0.1909097870
## 103 104 105 106 107 108
## 0.2120463101 0.1383994535 0.0708949509 0.0376997727 0.1432887529 0.0834577765
## 109 110 111 112 113 114
## 0.0792362717 0.1119463627 0.1618463380 0.9822321769 0.2307141816 0.2307141816
## 115 116 117 118 119 120
## 0.9822321769 0.6990355243 0.8580636277 0.8942101026 0.6990355243 0.4056824362
## 121 122 123 124 125 126
## 0.9361166156 0.9300561845 0.8372859227 0.9694768165 0.7993396439 0.7584357774
## 127 128 129 130 131 132
## 0.4056824362 0.9300561845 0.5628350830 0.8372859227 0.9822321769 0.7993396439
## 133 134 135 136 137 138
## 0.0936520778 0.8372859227 0.5628350830 0.7993396439 0.8372859227 0.1799155074
## 139 140 141 142 143 144
## 0.7584357774 0.7004148673 0.8844343440 0.7887635614 0.7004148673 0.7993396439
## 145 146 147 148 149 150
## 0.8844343440 0.8372859227 0.1799155074 0.8580636277 0.9361166156 0.5628350830
## 151 152 153 154 155 156
## 0.6558261080 0.8844343440 0.0936520778 0.8372859227 0.8372859227 0.5628350830
## 157 158 159 160 161 162
## 0.9822321769 0.6558261080 0.8942101026 0.9779909273 0.5628350830 0.9822321769
## 163 164 165 166 167 168
## 0.8343411495 0.8580636277 0.4056824362 0.9779909273 0.7993396439 0.7887635614
## 169 170 171 172 173 174
## 0.9600071925 0.9600071925 0.2307141816 0.7004148673 0.9864494168 0.5628350830
## 175 176 177 178 179 180
## 0.5628350830 0.1799155074 0.4056824362 0.8372859227 0.6990355243 0.9822321769
## 181 182 183 184 185 186
## 0.8343411495 0.7004148673 0.9694768165 0.9600071925 0.1978792907 0.8844343440
## 187 188 189 190 191 192
## 0.7584357774 0.5628350830 0.8343411495 0.7004148673 0.6558261080 0.6558261080
## 193 194 195 196 197 198
## 0.8844343440 0.9600071925 0.9300561845 0.0936520778 0.7004148673 0.8343411495
## 199 200 201 202 203 204
## 0.9694768165 0.6990355243 0.7004148673 0.9779909273 0.6990355243 0.7584357774
## 205 206 207 208 209 210
## 0.2307141816 0.8942101026 0.9779909273 0.9694768165 0.9361166156 0.9779909273
## 211 212 213 214 215 216
## 0.9300561845 0.9694768165 0.8844343440 0.7004148673 0.9361166156 0.6990355243
## 217 218 219 220 221 222
## 0.1799155074 0.8942101026 0.8942101026 0.8844343440 0.9600071925 0.5628350830
##################################
# Creating a classification index
# based from the model predictions
##################################
(LR_OS_UPSAMPLE_Model_Indices <- predict(LR_OS_UPSAMPLE_Model,
type = c("link")))
## 1 2 3 4 5 6
## 0.93443939 -3.79558061 -0.64065185 -5.78281396 -5.10944583 -3.25374392
## 7 8 9 10 11 12
## -1.43611755 -3.30797893 -0.38205745 1.53067859 0.34060860 -0.73899133
## 13 14 15 16 17 18
## 1.19099739 1.47505845 -0.65077721 -2.40658081 -0.61853150 -0.34082071
## 19 20 21 22 23 24
## -1.57588533 0.28350236 -1.34917429 -0.89898378 -1.88583723 -0.52605929
## 25 26 27 28 29 30
## -0.24885647 -1.78281940 -0.91974872 -3.31039195 -2.30445626 -5.35386842
## 31 32 33 34 35 36
## -2.93428382 -3.07203638 -5.47907171 -3.64039921 -4.77987133 -4.18863418
## 37 38 39 40 41 42
## -5.60432715 -6.26474851 -1.29487165 -5.70889009 -6.93637061 -1.27114688
## 43 44 45 46 47 48
## 0.37845127 -0.23079478 -2.24914176 -2.21264920 -2.33909390 2.85084581
## 49 50 51 52 53 54
## -2.56609386 -2.93522886 -1.81955986 -3.05909083 0.09743515 0.17427227
## 55 56 57 58 59 60
## -0.79355485 -0.21026021 1.08693202 2.02747256 2.37803393 -0.37822105
## 61 62 63 64 65 66
## -0.53967287 -1.13107773 -1.83946948 -1.32445384 -1.03308777 -0.87455160
## 67 68 69 70 71 72
## 0.21678837 -2.35221296 -1.20744383 0.13523906 2.03447311 2.60515452
## 73 74 75 76 77 78
## 0.23713683 -0.40708092 -5.02115426 -1.95642588 -2.37950354 -3.73099298
## 79 80 81 82 83 84
## -3.65285212 -5.01983969 -1.63665934 0.26927933 -2.61907944 -4.72572196
## 85 86 87 88 89 90
## -3.12842716 -2.33148548 -2.11832998 -5.85215776 -4.85562264 -2.41814677
## 91 92 93 94 95 96
## -2.34965674 -0.78230278 -0.53189150 -0.69209673 -0.96649144 0.49674883
## 97 98 99 100 101 102
## -0.56737849 -2.06700152 -0.40226017 0.56940673 -1.29423765 -1.44410943
## 103 104 105 106 107 108
## -1.31263462 -1.82864767 -2.57302259 -3.23967242 -1.78823908 -2.39626730
## 109 110 111 112 113 114
## -2.45276930 -2.07101229 -1.64455410 4.01243858 -1.20428294 -1.20428294
## 115 116 117 118 119 120
## 4.01243858 0.84270933 1.79929938 2.13448574 0.84270933 -0.38184312
## 121 122 123 124 125 126
## 2.68468076 2.58755271 1.63817108 3.45827005 1.38217223 1.14412276
## 127 128 129 130 131 132
## -0.38184312 2.58755271 0.25267615 1.63817108 4.01243858 1.38217223
## 133 134 135 136 137 138
## -2.26983664 1.63817108 0.25267615 1.38217223 1.63817108 -1.51692004
## 139 140 141 142 143 144
## 1.14412276 0.84927420 2.03510946 1.31748854 0.84927420 1.38217223
## 145 146 147 148 149 150
## 2.03510946 1.63817108 -1.51692004 1.79929938 2.68468076 0.25267615
## 151 152 153 154 155 156
## 0.64474864 2.03510946 -2.26983664 1.63817108 1.63817108 0.25267615
## 157 158 159 160 161 162
## 4.01243858 0.64474864 2.13448574 3.79404563 0.25267615 4.01243858
## 163 164 165 166 167 168
## 1.61671181 1.79929938 -0.38184312 3.79404563 1.38217223 1.31748854
## 169 170 171 172 173 174
## 3.17824115 3.17824115 -1.20428294 0.84927420 4.28768246 0.25267615
## 175 176 177 178 179 180
## 0.25267615 -1.51692004 -0.38184312 1.63817108 0.84270933 4.01243858
## 181 182 183 184 185 186
## 1.61671181 0.84927420 3.45827005 3.17824115 -1.39960190 2.03510946
## 187 188 189 190 191 192
## 1.14412276 0.25267615 1.61671181 0.84927420 0.64474864 0.64474864
## 193 194 195 196 197 198
## 2.03510946 3.17824115 2.58755271 -2.26983664 0.84927420 1.61671181
## 199 200 201 202 203 204
## 3.45827005 0.84270933 0.84927420 3.79404563 0.84270933 1.14412276
## 205 206 207 208 209 210
## -1.20428294 2.13448574 3.79404563 3.45827005 2.68468076 3.79404563
## 211 212 213 214 215 216
## 2.58755271 3.45827005 2.03510946 0.84927420 2.68468076 0.84270933
## 217 218 219 220 221 222
## -1.51692004 2.13448574 2.13448574 2.03510946 3.17824115 0.25267615
max(LR_OS_UPSAMPLE_Model_Indices)
## [1] 4.287682
min(LR_OS_UPSAMPLE_Model_Indices)
## [1] -6.936371
##################################
# Consolidating the model probabilities
# and classification index
# based from the model predictions
##################################
LR_OS_UPSAMPLE_Model_Predictions <- as.data.frame(PMA_PreModelling_Train_LR_OS_UPSAMPLE)
LR_OS_UPSAMPLE_Model_Predictions$LR_OS_UPSAMPLE_Prob <- LR_OS_UPSAMPLE_Model_Probabilities
LR_OS_UPSAMPLE_Model_Predictions$LR_OS_UPSAMPLE_LP <- LR_OS_UPSAMPLE_Model_Indices
LR_OS_UPSAMPLE_Model_Predictions$Class <- as.factor(LR_OS_UPSAMPLE_Model_Predictions$Class)
LR_OS_UPSAMPLE_Model_Predictions$Label <- rep("LR_OS_UPSAMPLE",nrow(LR_OS_UPSAMPLE_Model_Predictions))
##################################
# Formulating the probability curve
# using the consolidated model predictions
##################################
LR_OS_UPSAMPLE_Model_Predictions %>%
ggplot(aes(x = LR_OS_UPSAMPLE_LP ,
y = LR_OS_UPSAMPLE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
ggtitle("Estimated Rock Detection Probabilities Based on Classification Index : Logistic Regression (OS_UPSAMPLE)") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

1.5.4 Undersampling - Near Miss Algorithm (LR_US_NEARMISS)
The Near Miss Algorithm undersamples the majority class by retaining only the majority-class instances with the smallest mean distance to their nearest minority-class neighbors, removing the rest; a sketch of the selection rule follows below.
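The minimal base-R sketch below (illustration only, assuming k = 5 nearest neighbors) conveys the NearMiss-1 selection rule; the actual undersampling in this section is performed by the step_nearmiss() recipe step.
##################################
# A minimal base-R sketch of the
# NearMiss-1 selection rule
##################################
X <- as.matrix(PMA_PreModelling_Train[, c("V1","V11")])
maj_X <- X[PMA_PreModelling_Train$Class == "M", , drop = FALSE]
min_X <- X[PMA_PreModelling_Train$Class == "R", , drop = FALSE]
k <- 5
# Pairwise distances between majority (rows) and minority (columns) points
D <- as.matrix(dist(rbind(maj_X, min_X)))[seq_len(nrow(maj_X)),
                                          nrow(maj_X) + seq_len(nrow(min_X))]
# Mean distance of each majority point to its k nearest minority points
mean_knn_dist <- apply(D, 1, function(d) mean(sort(d)[seq_len(k)]))
# Retain only the majority points closest to the minority class
keep_majority <- order(mean_knn_dist)[seq_len(nrow(min_X))]
nrow(maj_X[keep_majority, , drop = FALSE])   # as many retained as minority instances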
Logistic Regression models the relationship between the probability of an event (among two outcome levels) and a set of predictors by expressing the log-odds of the event as a linear combination of the predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood: candidate values are refined over multiple iterations to maximize the log-likelihood function, which sums the logged conditional probabilities of the observed outcomes. Given the optimal parameters, the predicted probability for each observation is obtained by applying the inverse-logit transformation to its linear predictor.
[A] The class ratio of the original data was noted at 80:20.
[A.1] Majority Class = Class=M with 111 instances
[A.2] Minority Class = Class=R with 25 instances
[B] The class ratio of the undersampled data was noted at 50:50, although the total number of instances was decreased by 63%.
[B.1] Majority Class = Class=M with 25 instances
[B.2] Minority Class = Class=R with 25 instances
[C] The logistic regression model from the stats package was implemented. The Class response was regressed against the V1 and V11 predictors.
[D] The logistic curve, formed by plotting the predicted probabilities against the classification index (logit values), showed a sufficiently balanced logistic profile for the predicted points from both the majority and minority classes, although the distribution of instances was relatively sparse.
##################################
# Visualizing the imbalanced data set
##################################
ggplot(PMA_PreModelling_Train, aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "Without Undersampling - Near Miss Algorithm") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

##################################
# Implementing US_NEARMISS
# Visualizing the undersampled data using US_NEARMISS
##################################
recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_nearmiss(Class, seed=123456789) %>%
prep() %>%
bake(new_data = NULL) %>%
ggplot(aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "With Undersampling - Near Miss Algorithm") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

US_NEARMISS <- recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_nearmiss(Class, seed=123456789) %>%
prep()
PMA_PreModelling_Train_LR_US_NEARMISS <- US_NEARMISS %>%
bake(new_data = NULL)
(PMA_PreModelling_Train_LR_US_NEARMISS <- as.data.frame(PMA_PreModelling_Train_LR_US_NEARMISS))
## V1 V11 Class
## 1 0.914240973 -1.55861587 M
## 2 1.143139503 0.55690423 M
## 3 -0.935159101 0.03686239 M
## 4 1.169078303 -0.79246144 M
## 5 -1.204708453 -0.18639848 M
## 6 -0.068999246 0.13149291 M
## 7 0.565359423 0.74012531 M
## 8 2.230423635 1.21329169 M
## 9 0.782380194 0.93715691 M
## 10 1.784287809 0.74337479 M
## 11 1.934080462 1.47897609 M
## 12 0.744203377 2.70773324 M
## 13 1.002499594 0.07158504 M
## 14 1.281675939 -0.27257795 M
## 15 1.390588252 0.33359231 M
## 16 -0.180527407 -0.62433567 M
## 17 -0.168320027 -1.80173334 M
## 18 -0.132380328 -0.09011428 M
## 19 -0.789109306 -0.36302243 M
## 20 0.401039618 0.45156422 M
## 21 1.243886483 1.44694263 M
## 22 0.142960831 2.42255907 M
## 23 0.625610136 0.17668535 M
## 24 0.537644499 -0.19150300 M
## 25 0.157176488 1.02691760 M
## 26 -2.902302684 -1.49282840 R
## 27 -0.588710342 -0.46833442 R
## 28 -0.597170869 -1.70542147 R
## 29 -0.289270938 -1.17938825 R
## 30 0.544629613 -1.61762409 R
## 31 0.064360561 0.15607894 R
## 32 -0.168320027 -1.67226688 R
## 33 -0.316256467 -0.26298086 R
## 34 -1.638716948 -1.51501038 R
## 35 0.274190661 0.01776805 R
## 36 -0.860055136 -2.21946189 R
## 37 -0.789109306 -0.84516218 R
## 38 -1.427539411 -1.35491277 R
## 39 0.779483548 -1.33935885 R
## 40 -0.400799099 -2.26338258 R
## 41 -0.302692686 -0.92399268 R
## 42 0.827857790 0.18656255 R
## 43 -0.829167731 0.42261407 R
## 44 -2.434915803 -1.57286451 R
## 45 -1.138280881 -0.92728240 R
## 46 -0.860055136 -1.28443852 R
## 47 -1.125398710 0.66990478 R
## 48 -0.186674974 -1.39954451 R
## 49 -0.008605128 -1.57143555 R
## 50 -1.348878827 -0.41384379 R
PMA_PreModelling_Train_LR_US_NEARMISS$Label <- rep("LR_US_NEARMISS",nrow(PMA_PreModelling_Train_LR_US_NEARMISS))
##################################
# Verifying the class distribution
# for the undersampled data using US_NEARMISS
##################################
table(PMA_PreModelling_Train_LR_US_NEARMISS$Class)
##
## M R
## 25 25
##################################
# Formulating the structure of the
# Logistic Regression model
##################################
LR_US_NEARMISS_Model <- glm(Class ~ V1 + V11,
data = PMA_PreModelling_Train_LR_US_NEARMISS,
family = binomial)
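Note that glm() with a binomial family models the probability of the second level of the factor response; since the levels of Class are M then R, the fitted probabilities estimate P(Class=R), which is why the curves below are labelled as rock detection probabilities. A quick check of the level ordering (an added illustration):
##################################
# Confirming which class the modelled
# probabilities refer to (the second factor level)
##################################
levels(PMA_PreModelling_Train_LR_US_NEARMISS$Class)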
##################################
# Consolidating the model results
##################################
summary(LR_US_NEARMISS_Model)
##
## Call:
## glm(formula = Class ~ V1 + V11, family = binomial, data = PMA_PreModelling_Train_LR_US_NEARMISS)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.01997 -0.52262 -0.00514 0.64473 2.00409
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5276 0.4512 -1.169 0.2423
## V1 -1.3388 0.5264 -2.543 0.0110 *
## V11 -1.2227 0.4795 -2.550 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 69.315 on 49 degrees of freedom
## Residual deviance: 40.340 on 47 degrees of freedom
## AIC: 46.34
##
## Number of Fisher Scoring iterations: 5
LR_US_NEARMISS_Model_Coef <- (as.data.frame(LR_US_NEARMISS_Model$coefficients))
LR_US_NEARMISS_Model_Coef$Coef <- rownames(LR_US_NEARMISS_Model_Coef)
LR_US_NEARMISS_Model_Coef$Model <- rep("LR_US_NEARMISS",nrow(LR_US_NEARMISS_Model_Coef))
colnames(LR_US_NEARMISS_Model_Coef) <- c("Estimates","Coefficients","Model")
print(LR_US_NEARMISS_Model_Coef, rownames=FALSE)
## Estimates Coefficients Model
## (Intercept) -0.5275579 (Intercept) LR_US_NEARMISS
## V1 -1.3388434 V1 LR_US_NEARMISS
## V11 -1.2227451 V11 LR_US_NEARMISS
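Exponentiating the coefficients (an added note, not part of the original output) expresses them as odds ratios: exp(-1.3388) is approximately 0.26, so a one-unit increase in V1 multiplies the odds of Class=R by about 0.26, holding V11 fixed.
##################################
# Expressing the coefficients as odds ratios
# (added illustration)
##################################
exp(coef(LR_US_NEARMISS_Model))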
##################################
# Computing the model predictions
##################################
(LR_US_NEARMISS_Model_Probabilities <- predict(LR_US_NEARMISS_Model,
type = c("response")))
## 1 2 3 4 5 6
## 0.538475404 0.060711209 0.663604862 0.245308514 0.788060819 0.355268290
## 7 8 9 10 11 12
## 0.100700058 0.006710875 0.061748607 0.021344245 0.007207500 0.007885623
## 13 14 15 16 17 18
## 0.123761916 0.128955243 0.057473709 0.617167616 0.869988474 0.440253186
## 19 20 21 22 23 24
## 0.725681187 0.165669455 0.018666733 0.024575272 0.170626580 0.266350095
## 25 26 27 28 29 30
## 0.119871133 0.994423044 0.697049432 0.913509256 0.786141426 0.672873462
## 31 32 33 34 35 36
## 0.309048509 0.851010973 0.554141527 0.971220746 0.285695645 0.965701640
## 37 38 39 40 41 42
## 0.826690849 0.954366549 0.516626398 0.941397563 0.732532124 0.134231886
## 43 44 45 46 47 48
## 0.516448710 0.990581586 0.893808927 0.899751107 0.539927799 0.807470453
## 49 50
## 0.803043836 0.856239957
##################################
# Creating a classification index
# based on the model predictions
##################################
(LR_US_NEARMISS_Model_Indices <- predict(LR_US_NEARMISS_Model,
type = c("link")))
## 1 2 3 4 5 6
## 0.15420647 -2.73899464 0.67940041 -1.12379238 1.31327592 -0.59596102
## 7 8 9 10 11 12
## -2.18947025 -4.99729239 -2.72094650 -3.82539779 -4.92539958 -4.83479722
## 13 14 15 16 17 18
## -1.95727815 -1.91022797 -2.79723621 0.47754340 1.90085685 -0.24013458
## 19 20 21 22 23 24
## 0.97281980 -1.61663509 -3.96216914 -3.68113228 -1.58119307 -1.01322036
## 25 26 27 28 29 30
## -1.99365106 5.18351957 0.83328668 2.35725611 1.30182178 0.72121023
## 31 32 33 34 35 36
## -0.80457137 1.74255237 0.21741854 3.51889902 -0.91638207 3.33775738
## 37 38 39 40 41 42
## 1.56235380 3.04040680 0.06653012 2.77658926 1.00750772 -1.86404831
## 43 44 45 46 47 48
## 0.06581859 4.65562548 2.13025197 2.19446215 0.16005199 1.43365683
## 49 50
## 1.40542811 1.78440512
max(LR_US_NEARMISS_Model_Indices)
## [1] 5.18352
min(LR_US_NEARMISS_Model_Indices)
## [1] -4.997292
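As a sanity check (an added illustration, not part of the original analysis), the link-scale indices equal the linear predictor formed from the fitted coefficients, and the response-scale probabilities are their inverse-logit; both comparisons should print TRUE.
##################################
# Verifying the relationship between the
# classification index and the probabilities
##################################
lp_manual <- as.numeric(model.matrix(LR_US_NEARMISS_Model) %*% coef(LR_US_NEARMISS_Model))
all.equal(lp_manual, as.numeric(LR_US_NEARMISS_Model_Indices))
all.equal(as.numeric(LR_US_NEARMISS_Model_Probabilities),
plogis(as.numeric(LR_US_NEARMISS_Model_Indices)))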
##################################
# Consolidating the model probabilities
# and classification index
# based on the model predictions
##################################
LR_US_NEARMISS_Model_Predictions <- as.data.frame(PMA_PreModelling_Train_LR_US_NEARMISS)
LR_US_NEARMISS_Model_Predictions$LR_US_NEARMISS_Prob <- LR_US_NEARMISS_Model_Probabilities
LR_US_NEARMISS_Model_Predictions$LR_US_NEARMISS_LP <- LR_US_NEARMISS_Model_Indices
LR_US_NEARMISS_Model_Predictions$Class <- as.factor(LR_US_NEARMISS_Model_Predictions$Class)
LR_US_NEARMISS_Model_Predictions$Label <- rep("LR_US_NEARMISS",nrow(LR_US_NEARMISS_Model_Predictions))
##################################
# Formulating the probability curve
# using the consolidated model predictions
##################################
LR_US_NEARMISS_Model_Predictions %>%
ggplot(aes(x = LR_US_NEARMISS_LP ,
y = LR_US_NEARMISS_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
ggtitle("Estimated Rock Detection Probabilities Based on Classification Index : Logistic Regression (US_NEARMISS)") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

1.5.5 Undersampling - Tomek Links (LR_US_TOMEK)
Tomek Links undersampling removes the majority class instances that participate in Tomek links, where a Tomek link is a pair of points from different classes that are each other's nearest neighbors. A minimal sketch of the link detection follows the summary of findings below.
Logistic Regression models the relationship between the probability of an event (among two outcome levels) by expressing the log-odds of the event as a linear combination of a set of predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood: candidate coefficient values are evaluated over multiple iterations, and the set that maximizes the log-likelihood of the observed data is retained. Given the optimal parameters, the conditional probability for each observation is obtained by passing its linear predictor through the logistic function.
[A] The class ratio of the original data was noted at 80:20.
[A.1] Majority Class = Class=M with 111 instances
[A.2] Minority Class = Class=R with 25 instances
[B] The class ratio of the undersampled data was noted at approximately 86:14 (102:16 instances), leaving the data even more imbalanced than the original.
[B.1] Majority Class = Class=M with 102 instances
[B.2] Minority Class = Class=R with 16 instances
[C] The logistic regression model from the stats package was implemented. The Class response was regressed against the V1 and V11 predictors.
[D] The logistic curve formulated by plotting the predicted probabilities against the classification index (logit values) showed a skewed logistic profile with a longer tail for the predicted points belonging to the majority class.
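A minimal sketch of the link detection step (illustrative only; the helper name find_tomek_links is an assumption, not the themis implementation): two observations form a Tomek link when they belong to different classes and are each other's nearest neighbor, and undersampling then drops the majority class member of every link.
##################################
# Sketching Tomek link detection
# (illustrative only; not the themis internals)
##################################
find_tomek_links <- function(X, y) {
D <- as.matrix(dist(X))
diag(D) <- Inf
nn <- apply(D, 1, which.min) # each point's nearest neighbor
mutual <- nn[nn] == seq_along(nn) # mutual nearest-neighbor pairs
which(mutual & y != y[nn]) # differing classes form a link
}
linked <- find_tomek_links(PMA_PreModelling_Train[, c("V1", "V11")],
PMA_PreModelling_Train$Class)
# Majority class members of the links are the removal candidates
linked[PMA_PreModelling_Train$Class[linked] == "M"]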
##################################
# Visualizing the imbalanced data set
##################################
ggplot(PMA_PreModelling_Train, aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "Without Undersampling - Tomek Links") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

##################################
# Implementing US_TOMEK
# Visualizing the undersampled data using US_TOMEK
##################################
recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_tomek(Class, seed=123456789) %>%
prep() %>%
bake(new_data = NULL) %>%
ggplot(aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "With Undersampling - Tomek Links") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

US_TOMEK <- recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_tomek(Class, seed=123456789) %>%
prep()
PMA_PreModelling_Train_LR_US_TOMEK <- US_TOMEK %>%
bake(new_data = NULL)
(PMA_PreModelling_Train_LR_US_TOMEK <- as.data.frame(PMA_PreModelling_Train_LR_US_TOMEK))
## V1 V11 Class
## 1 0.914240973 -1.55861587 M
## 2 2.175018299 0.34520609 M
## 3 -0.230553865 -0.16690717 M
## 4 1.231717734 1.85842121 M
## 5 0.424204681 1.89154083 M
## 6 1.143139503 0.55690423 M
## 7 -0.507041623 0.38985083 M
## 8 0.312328639 0.98808941 M
## 9 -0.935159101 0.03686239 M
## 10 0.738227771 -1.15770286 M
## 11 1.169078303 -0.79246144 M
## 12 0.118911997 -1.30982705 M
## 13 1.012209983 -0.76338471 M
## 14 0.207991857 -0.39104999 M
## 15 -0.957451233 0.02573746 M
## 16 0.695599819 -0.11827986 M
## 17 -0.068999246 0.13149291 M
## 18 0.527094931 -0.39650480 M
## 19 0.104263091 0.33286517 M
## 20 0.465616266 -0.56497212 M
## 21 -0.507041623 -0.24127887 M
## 22 0.043933590 0.30731955 M
## 23 0.147716923 -0.20173692 M
## 24 0.612463710 0.84401925 M
## 25 -0.068999246 0.63930609 M
## 26 1.395977603 1.55085071 M
## 27 -0.180527407 1.02812486 M
## 28 0.565359423 0.74012531 M
## 29 2.230423635 1.21329169 M
## 30 0.782380194 0.93715691 M
## 31 2.005116912 0.95072168 M
## 32 1.784287809 0.74337479 M
## 33 1.523834205 1.62207084 M
## 34 1.928176420 1.77732114 M
## 35 1.934080462 1.47897609 M
## 36 0.744203377 2.70773324 M
## 37 1.424344244 -0.63319889 M
## 38 0.689392711 -0.83030322 M
## 39 1.381555338 -0.09258957 M
## 40 1.002499594 0.07158504 M
## 41 0.977896818 0.15071601 M
## 42 0.278484686 -2.26945660 M
## 43 1.406688795 0.06372536 M
## 44 1.852613244 0.04399440 M
## 45 1.281675939 -0.27257795 M
## 46 1.390588252 0.33359231 M
## 47 -0.192852168 -0.57752146 M
## 48 -0.180527407 -0.62433567 M
## 49 -0.052243905 -0.17198030 M
## 50 -0.779287302 -0.12994977 M
## 51 -0.041194136 -1.17696945 M
## 52 -2.093160440 0.59563328 M
## 53 -0.230553865 -0.22058604 M
## 54 -1.038676203 0.48516249 M
## 55 -0.030238811 0.37335493 M
## 56 0.089444831 0.04161877 M
## 57 0.303951059 -0.21715050 M
## 58 -0.132380328 -0.09011428 M
## 59 -1.546463816 0.01457488 M
## 60 -0.499153812 0.87301303 M
## 61 -0.108968439 0.07550792 M
## 62 0.686277848 -1.02337267 M
## 63 -0.721885235 -1.35101476 M
## 64 -3.557061230 -0.28132844 M
## 65 -0.789109306 -0.36302243 M
## 66 -0.750264961 -0.03937819 M
## 67 -0.379134945 2.23365699 M
## 68 -0.371994173 0.60103638 M
## 69 0.401039618 0.45156422 M
## 70 -0.295964244 1.50754826 M
## 71 0.256869245 1.19827720 M
## 72 1.243886483 1.44694263 M
## 73 -0.230553865 0.36255507 M
## 74 -0.256318859 -0.63813480 M
## 75 0.632133128 0.46700244 M
## 76 0.324793230 1.73570326 M
## 77 0.723165726 0.69367752 M
## 78 -1.191144673 1.19711994 M
## 79 -1.177722925 1.07730973 M
## 80 0.142960831 2.42255907 M
## 81 0.443194144 1.74741590 M
## 82 -0.180527407 0.75375436 M
## 83 0.544629613 0.36615870 M
## 84 -1.274767709 0.41409622 M
## 85 -0.539087424 -0.07530230 M
## 86 -0.555415865 0.01776805 M
## 87 0.345301912 -0.27257795 M
## 88 -1.868589923 0.02175517 M
## 89 -0.217863016 -0.21200423 M
## 90 0.625610136 0.17668535 M
## 91 -0.799007156 -0.01833539 M
## 92 -2.013850697 0.05347991 M
## 93 0.537644499 -0.19150300 M
## 94 -0.013979414 0.15531332 M
## 95 -0.935159101 0.53154288 M
## 96 -0.779287302 0.73036007 M
## 97 0.424204681 0.54321732 M
## 98 0.157176488 1.02691760 M
## 99 -0.323092564 0.48794823 M
## 100 0.377448180 0.47190071 M
## 101 0.992715078 0.20396774 M
## 102 0.295518362 0.33867807 M
## 103 -2.902302684 -1.49282840 R
## 104 -0.597170869 -1.70542147 R
## 105 -0.289270938 -1.17938825 R
## 106 -0.316256467 -0.26298086 R
## 107 -1.638716948 -1.51501038 R
## 108 0.274190661 0.01776805 R
## 109 -0.860055136 -2.21946189 R
## 110 -0.789109306 -0.84516218 R
## 111 -1.427539411 -1.35491277 R
## 112 0.779483548 -1.33935885 R
## 113 -0.400799099 -2.26338258 R
## 114 -0.302692686 -0.92399268 R
## 115 -0.829167731 0.42261407 R
## 116 -2.434915803 -1.57286451 R
## 117 -1.138280881 -0.92728240 R
## 118 -0.008605128 -1.57143555 R
PMA_PreModelling_Train_LR_US_TOMEK$Label <- rep("LR_US_TOMEK",nrow(PMA_PreModelling_Train_LR_US_TOMEK))
##################################
# Verifying the class distribution
# for the undersampled data using US_TOMEK
##################################
table(PMA_PreModelling_Train_LR_US_TOMEK$Class)
##
## M R
## 102 16
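Expressing the counts as proportions (an added check supporting the class ratio noted in finding [B]):
##################################
# Expressing the class distribution as proportions
##################################
round(prop.table(table(PMA_PreModelling_Train_LR_US_TOMEK$Class)), 2)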
##################################
# Formulating the structure of the
# Logistic Regression model
##################################
LR_US_TOMEK_Model <- glm(Class ~ V1 + V11,
data = PMA_PreModelling_Train_LR_US_TOMEK,
family = binomial)
##################################
# Consolidating the model results
##################################
summary(LR_US_TOMEK_Model)
##
## Call:
## glm(formula = Class ~ V1 + V11, family = binomial, data = PMA_PreModelling_Train_LR_US_TOMEK)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.88129 -0.38305 -0.18048 -0.04904 2.61741
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.1192 0.5681 -5.491 4.00e-08 ***
## V1 -0.8550 0.3663 -2.334 0.0196 *
## V11 -2.1768 0.5199 -4.187 2.83e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 93.664 on 117 degrees of freedom
## Residual deviance: 51.097 on 115 degrees of freedom
## AIC: 57.097
##
## Number of Fisher Scoring iterations: 7
LR_US_TOMEK_Model_Coef <- (as.data.frame(LR_US_TOMEK_Model$coefficients))
LR_US_TOMEK_Model_Coef$Coef <- rownames(LR_US_TOMEK_Model_Coef)
LR_US_TOMEK_Model_Coef$Model <- rep("LR_US_TOMEK",nrow(LR_US_TOMEK_Model_Coef))
colnames(LR_US_TOMEK_Model_Coef) <- c("Estimates","Coefficients","Model")
print(LR_US_TOMEK_Model_Coef, rownames=FALSE)
## Estimates Coefficients Model
## (Intercept) -3.1192250 (Intercept) LR_US_TOMEK
## V1 -0.8549776 V1 LR_US_TOMEK
## V11 -2.1767958 V11 LR_US_TOMEK
##################################
# Computing the model predictions
##################################
(LR_US_TOMEK_Model_Probabilities <- predict(LR_US_TOMEK_Model,
type = c("response")))
## 1 2 3 4 5 6
## 3.756409e-01 3.235729e-03 7.183859e-02 2.697569e-04 5.004968e-04 4.923252e-03
## 7 8 9 10 11 12
## 2.835075e-02 3.922375e-03 8.317828e-02 2.261271e-01 8.365176e-02 4.086217e-01
## 13 14 15 16 17 18
## 8.924332e-02 7.974495e-02 8.653858e-02 3.057631e-02 3.401110e-02 6.257509e-02
## 19 20 21 22 23 24
## 1.920963e-02 9.216455e-02 1.033540e-01 2.133670e-02 5.698041e-02 4.151500e-03
## 25 26 27 28 29 30
## 1.152242e-02 4.577910e-04 5.470520e-03 5.412073e-03 4.676720e-04 2.934903e-03
## 31 32 33 34 35 36
## 1.003676e-03 1.902036e-03 3.514870e-04 1.774511e-04 3.379585e-04 6.444174e-05
## 37 38 39 40 41 42
## 4.932868e-02 1.299682e-01 1.632055e-02 1.579461e-02 1.360818e-02 8.296036e-01
## 43 44 45 46 47 48
## 1.142309e-02 8.171200e-03 2.604136e-02 6.468579e-03 1.548320e-01 1.671688e-01
## 49 50 51 52 53 54
## 6.296205e-02 1.024699e-01 3.723887e-01 6.747167e-02 8.003027e-02 3.601015e-02
## 55 56 57 58 59 60
## 1.972224e-02 3.604439e-02 5.183789e-02 5.679248e-02 1.383861e-01 1.002269e-02
## 61 62 63 64 65 66
## 3.952765e-02 1.856837e-01 6.079859e-01 6.305035e-01 1.605248e-01 8.378117e-02
## 67 68 69 70 71 72
## 4.723640e-04 1.615066e-02 1.160016e-02 2.133604e-03 2.606214e-03 6.535624e-04
## 73 74 75 76 77 78
## 2.386204e-02 1.807923e-01 9.227894e-03 7.647569e-04 5.233073e-03 8.953573e-03
## 79 80 81 82 83 84
## 1.145983e-02 2.004250e-04 6.737929e-04 9.896313e-03 1.234693e-02 5.065520e-02
## 85 86 87 88 89 90
## 7.625172e-02 6.398192e-02 5.619436e-02 1.723620e-01 7.788281e-02 1.731500e-02
## 91 92 93 94 95 96
## 8.346450e-02 1.803700e-01 4.061963e-02 3.090759e-02 2.998119e-02 1.724570e-02
## 97 98 99 100 101 102
## 9.337053e-03 4.114982e-03 1.974037e-02 1.132709e-02 1.198585e-02 1.615689e-02
## 103 104 105 106 107 108
## 9.316147e-01 7.509522e-01 4.244291e-01 9.309838e-02 8.291634e-01 3.253605e-02
## 109 110 111 112 113 114
## 9.203724e-01 3.532443e-01 7.409012e-01 2.952220e-01 8.957034e-01 2.996246e-01
## 115 116 117 118
## 3.454790e-02 9.157827e-01 4.681641e-01 5.765988e-01
##################################
# Creating a classification index
# based on the model predictions
##################################
(LR_US_TOMEK_Model_Indices <- predict(LR_US_TOMEK_Model,
type = c("link")))
## 1 2 3 4 5 6 7
## -0.5080921 -5.7302601 -2.5587838 -8.2177196 -7.5994087 -5.3088505 -3.5343414
## 8 9 10 11 12 13 14
## -5.5371279 -2.3999268 -1.2303105 -2.3937341 -0.3696661 -2.3229093 -2.4458174
## 15 16 17 18 19 20 21
## -2.3566509 -3.4564762 -3.3464654 -2.7067694 -3.9329471 -2.2874876 -2.1605009
## 22 23 24 25 26 27 28
## -3.8257592 -2.8063796 -5.4801254 -4.4518710 -7.6886399 -5.2028960 -5.2136964
## 29 30 31 32 33 34 35
## -7.6672756 -5.8281418 -6.9030821 -6.2629263 -7.9529862 -8.6366379 -7.9922495
## 36 37 38 39 40 41 42
## -9.6496846 -2.9586628 -1.9012398 -4.0988753 -4.1321657 -4.2833829 1.5828204
## 43 44 45 46 47 48 49
## -4.4606296 -4.7989347 -3.6216827 -5.0343092 -1.6971944 -1.6058269 -2.7001917
## 50 51 52 53 54 55 56
## -2.1700777 -0.5219828 -2.6261917 -2.4419359 -3.2872798 -3.9060890 -3.2862939
## 57 58 59 60 61 62 63
## -2.9064041 -2.8098824 -1.8287596 -4.5928308 -3.1904248 -1.4783039 0.4388540
## 64 65 66 67 68 69 70
## 0.5343773 -1.6543285 -2.3920470 -7.6572883 -4.1095118 -4.4450680 -6.1478070
## 71 72 73 74 75 76 77
## -5.9472473 -7.3324188 -3.7113150 -1.5109890 -4.6762536 -7.1751876 -5.2475099
## 78 79 80 81 82 83 84
## -4.7067087 -4.4573816 -8.5148698 -7.3019137 -4.6056475 -4.3819239 -2.9307301
## 85 86 87 88 89 90 91
## -2.4943996 -2.6830343 -2.8211039 -1.5689790 -2.4714671 -4.0387156 -2.3961794
## 92 93 94 95 96 97 98
## -1.5138426 -3.1620361 -3.4453583 -3.4767452 -4.0427966 -4.6643837 -5.4889973
## 99 100 101 102 103 104 105
## -3.9051518 -4.4691663 -4.4119703 -4.1091196 2.6117614 1.1036970 -0.3046174
## 106 107 108 109 110 111 112
## -2.2763772 1.5797095 -3.3923293 2.4474182 -0.6048087 1.0506577 -0.8701553
## 113 114 115 116 117 118
## 2.1503710 -0.8490861 -3.3302497 2.3863784 -0.1275159 0.3088265
max(LR_US_TOMEK_Model_Indices)
## [1] 2.611761
min(LR_US_TOMEK_Model_Indices)
## [1] -9.649685
##################################
# Consolidating the model probabilities
# and classification index
# based on the model predictions
##################################
LR_US_TOMEK_Model_Predictions <- as.data.frame(PMA_PreModelling_Train_LR_US_TOMEK)
LR_US_TOMEK_Model_Predictions$LR_US_TOMEK_Prob <- LR_US_TOMEK_Model_Probabilities
LR_US_TOMEK_Model_Predictions$LR_US_TOMEK_LP <- LR_US_TOMEK_Model_Indices
LR_US_TOMEK_Model_Predictions$Class <- as.factor(LR_US_TOMEK_Model_Predictions$Class)
LR_US_TOMEK_Model_Predictions$Label <- rep("LR_US_TOMEK",nrow(LR_US_TOMEK_Model_Predictions))
##################################
# Formulating the probability curve
# using the consolidated model predictions
##################################
LR_US_TOMEK_Model_Predictions %>%
ggplot(aes(x = LR_US_TOMEK_LP ,
y = LR_US_TOMEK_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
ggtitle("Estimated Rock Detection Probabilities Based on Classification Index : Logistic Regression (US_TOMEK)") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

1.5.6 Oversampling - Adaptive Synthetic Algorithm (LR_OS_ADASYN)
The Adaptive Synthetic Algorithm uses a weighted distribution over the minority class instances according to their level of difficulty in learning: more synthetic data is generated for minority class instances that are harder to learn than for those that are easier to learn. The algorithm improves learning with respect to the data distributions by reducing the bias introduced by the class imbalance and adaptively shifting the classification decision boundary toward the difficult examples. A minimal sketch of the weighting step follows the summary of findings below.
Logistic Regression models the relationship between the probability of an event (among two outcome levels) by expressing the log-odds of the event as a linear combination of a set of predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood: candidate coefficient values are evaluated over multiple iterations, and the set that maximizes the log-likelihood of the observed data is retained. Given the optimal parameters, the conditional probability for each observation is obtained by passing its linear predictor through the logistic function.
[A] The class ratio of the original data was noted at 80:20.
[A.1] Majority Class = Class=M with 111 instances
[A.2] Minority Class = Class=R with 25 instances
[B] The class ratio of the oversampled data was noted at 50:50, with the majority of the added instances being unique values for the minority class.
[B.1] Majority Class = Class=M with 111 instances
[B.2] Minority Class = Class=R with 111 instances
[C] The logistic regression model from the stats package was implemented. The Class response was regressed against the V1 and V11 predictors.
[D] The logistic curve formulated by plotting the predicted probabilities against the classification index (logit values) showed a sufficiently balanced logistic profile for the predicted points from both the majority and minority classes. Although the ratio of unique values to the number of instances was relatively high, the added instances tended to cluster at the center of the data distribution.
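A minimal sketch of the weighting step (illustrative only; the helper name adasyn_weights and k=5 are assumptions, not the themis implementation): each minority point is weighted by the fraction of majority class points among its k nearest neighbors, and the normalized weights determine how many synthetic samples are generated around each point.
##################################
# Sketching the ADASYN difficulty weights
# (illustrative only; not the themis internals)
##################################
adasyn_weights <- function(X, y, minority = "R", k = 5) {
D <- as.matrix(dist(X))
diag(D) <- Inf
r <- sapply(which(y == minority), function(i) {
nn <- order(D[i, ])[seq_len(k)] # k nearest neighbors over both classes
mean(y[nn] != minority) # fraction of majority class neighbors
})
r / sum(r) # normalized weights; harder points get more synthetic samples
}
adasyn_weights(PMA_PreModelling_Train[, c("V1", "V11")],
PMA_PreModelling_Train$Class)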
##################################
# Visualizing the imbalanced data set
##################################
ggplot(PMA_PreModelling_Train, aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
labs(title = "Without Oversampling - Adaptive Synthetic Algorithm") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

##################################
# Implementing OS_ADASYN
# Visualizing the oversampled data using OS_ADASYN
##################################
recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_adasyn(Class, seed=123456789) %>%
prep() %>%
bake(new_data = NULL) %>%
ggplot(aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "With Oversampling - Adaptive Synthetic Algorithm") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

OS_ADASYN <- recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_adasyn(Class, seed=123456789) %>%
prep()
PMA_PreModelling_Train_LR_OS_ADASYN <- OS_ADASYN %>%
bake(new_data = NULL)
(PMA_PreModelling_Train_LR_OS_ADASYN <- as.data.frame(PMA_PreModelling_Train_LR_OS_ADASYN))
## V1 V11 Class
## 1 0.914240973 -1.558615871 M
## 2 2.175018299 0.345206089 M
## 3 -0.230553865 -0.166907171 M
## 4 1.231717734 1.858421213 M
## 5 0.424204681 1.891540835 M
## 6 1.143139503 0.556904232 M
## 7 -0.507041623 0.389850825 M
## 8 0.312328639 0.988089406 M
## 9 -0.935159101 0.036862390 M
## 10 0.408807916 -1.630790485 M
## 11 0.738227771 -1.157702863 M
## 12 1.169078303 -0.792461442 M
## 13 0.118911997 -1.309827049 M
## 14 -0.174409184 -1.318776567 M
## 15 1.012209983 -0.763384712 M
## 16 0.813818599 0.266052702 M
## 17 0.207991857 -0.391049992 M
## 18 -0.957451233 0.025737464 M
## 19 0.695599819 -0.118279862 M
## 20 -1.204708453 -0.186398477 M
## 21 -0.068999246 0.131492907 M
## 22 0.527094931 -0.396504803 M
## 23 0.104263091 0.332865171 M
## 24 0.465616266 -0.564972117 M
## 25 -0.507041623 -0.241278874 M
## 26 0.043933590 0.307319548 M
## 27 0.147716923 -0.201736917 M
## 28 0.612463710 0.844019251 M
## 29 -0.068999246 0.639306087 M
## 30 1.395977603 1.550850706 M
## 31 -0.180527407 1.028124858 M
## 32 0.565359423 0.740125313 M
## 33 2.230423635 1.213291690 M
## 34 0.782380194 0.937156911 M
## 35 2.005116912 0.950721679 M
## 36 1.784287809 0.743374785 M
## 37 1.523834205 1.622070841 M
## 38 1.928176420 1.777321144 M
## 39 -1.204708453 0.652640830 M
## 40 1.934080462 1.478976095 M
## 41 0.744203377 2.707733238 M
## 42 1.424344244 -0.633198890 M
## 43 -0.491314245 -0.582362640 M
## 44 0.689392711 -0.830303223 M
## 45 1.381555338 -0.092589575 M
## 46 1.002499594 0.071585037 M
## 47 0.977896818 0.150716010 M
## 48 0.278484686 -2.269456602 M
## 49 1.406688795 0.063725362 M
## 50 1.852613244 0.043994401 M
## 51 1.281675939 -0.272577947 M
## 52 1.390588252 0.333592312 M
## 53 -0.192852168 -0.577521460 M
## 54 -0.180527407 -0.624335667 M
## 55 -0.052243905 -0.171980296 M
## 56 -0.779287302 -0.129949769 M
## 57 -0.041194136 -1.176969454 M
## 58 -0.924156756 -1.249334984 M
## 59 -0.168320027 -1.801733345 M
## 60 -2.093160440 0.595633277 M
## 61 -0.230553865 -0.220586042 M
## 62 -1.038676203 0.485162490 M
## 63 -0.030238811 0.373354928 M
## 64 0.089444831 0.041618772 M
## 65 0.303951059 -0.217150498 M
## 66 -0.132380328 -0.090114281 M
## 67 -1.546463816 0.014574884 M
## 68 -0.499153812 0.873013033 M
## 69 -0.108968439 0.075507923 M
## 70 0.686277848 -1.023372673 M
## 71 -0.721885235 -1.351014760 M
## 72 -3.557061230 -0.281328435 M
## 73 -0.789109306 -0.363022432 M
## 74 -0.750264961 -0.039378188 M
## 75 -0.379134945 2.233656987 M
## 76 -0.371994173 0.601036378 M
## 77 0.401039618 0.451564218 M
## 78 -0.295964244 1.507548259 M
## 79 0.256869245 1.198277196 M
## 80 1.243886483 1.446942626 M
## 81 -0.230553865 0.362555066 M
## 82 -0.256318859 -0.638134800 M
## 83 0.632133128 0.467002438 M
## 84 0.324793230 1.735703263 M
## 85 0.723165726 0.693677522 M
## 86 -1.191144673 1.197119945 M
## 87 -1.177722925 1.077309728 M
## 88 0.142960831 2.422559073 M
## 89 0.443194144 1.747415902 M
## 90 -0.180527407 0.753754356 M
## 91 0.544629613 0.366158697 M
## 92 -1.274767709 0.414096217 M
## 93 -0.539087424 -0.075302304 M
## 94 -0.555415865 0.017768054 M
## 95 0.345301912 -0.272577947 M
## 96 -1.868589923 0.021755168 M
## 97 -0.217863016 -0.212004228 M
## 98 0.625610136 0.176685353 M
## 99 -0.799007156 -0.018335390 M
## 100 -2.013850697 0.053479912 M
## 101 0.537644499 -0.191503002 M
## 102 -0.013979414 0.155313323 M
## 103 -0.935159101 0.531542880 M
## 104 -0.779287302 0.730360068 M
## 105 0.424204681 0.543217319 M
## 106 0.157176488 1.026917595 M
## 107 -0.323092564 0.487948233 M
## 108 0.377448180 0.471900708 M
## 109 0.992715078 0.203967739 M
## 110 0.295518362 0.338678072 M
## 111 0.099342681 0.206985690 M
## 112 -2.902302684 -1.492828399 R
## 113 -0.588710342 -0.468334419 R
## 114 -0.597170869 -1.705421467 R
## 115 -0.289270938 -1.179388252 R
## 116 0.544629613 -1.617624095 R
## 117 0.064360561 0.156078935 R
## 118 -0.168320027 -1.672266879 R
## 119 -0.316256467 -0.262980859 R
## 120 -1.638716948 -1.515010380 R
## 121 0.274190661 0.017768054 R
## 122 -0.860055136 -2.219461886 R
## 123 -0.789109306 -0.845162176 R
## 124 -1.427539411 -1.354912772 R
## 125 0.779483548 -1.339358851 R
## 126 -0.400799099 -2.263382579 R
## 127 -0.302692686 -0.923992684 R
## 128 0.827857790 0.186562547 R
## 129 -0.829167731 0.422614069 R
## 130 -2.434915803 -1.572864509 R
## 131 -1.138280881 -0.927282397 R
## 132 -0.860055136 -1.284438518 R
## 133 -1.125398710 0.669904780 R
## 134 -0.186674974 -1.399544510 R
## 135 -0.008605128 -1.571435552 R
## 136 -1.348878827 -0.413843793 R
## 137 -0.390322558 -0.784388451 R
## 138 -0.295860799 -1.053993156 R
## 139 -0.017377454 -1.559181951 R
## 140 -0.191911171 -1.315384987 R
## 141 -0.290855568 -1.149235140 R
## 142 -0.035273087 -1.662069728 R
## 143 0.346506687 -1.632808882 R
## 144 0.682389831 -1.454399746 R
## 145 -0.356332622 -0.246153717 R
## 146 0.266972171 0.022526169 R
## 147 0.652376435 0.179556227 R
## 148 -0.342166539 0.277343988 R
## 149 0.104992266 0.129296282 R
## 150 0.128805610 0.158651987 R
## 151 -0.082002824 0.016138154 R
## 152 -0.096321170 0.002448115 R
## 153 -0.178557486 -1.673058340 R
## 154 -0.063390608 -1.606022761 R
## 155 -0.176865876 -1.545290531 R
## 156 -0.180838432 -1.486265301 R
## 157 -0.358706919 -2.156356393 R
## 158 -0.349557028 -0.303980855 R
## 159 -0.582661828 -0.590981902 R
## 160 0.038582729 -0.094260047 R
## 161 0.035702482 0.124526356 R
## 162 -0.574843937 -0.457883052 R
## 163 0.082506030 -0.073375165 R
## 164 -0.529153404 -0.423445246 R
## 165 -0.390201423 -0.354022656 R
## 166 -0.059449742 0.019763646 R
## 167 -0.557747942 -0.560307667 R
## 168 -0.308189318 -0.656122036 R
## 169 0.219359952 0.053910074 R
## 170 0.748334227 0.162318486 R
## 171 0.167225292 0.088274976 R
## 172 0.248268872 0.034854570 R
## 173 0.755598258 0.164533045 R
## 174 0.219609003 -0.008184720 R
## 175 -0.130244888 -0.642472049 R
## 176 -0.784077436 -2.070896097 R
## 177 -0.647882130 -1.804581595 R
## 178 -0.512560514 -2.252694339 R
## 179 -0.807753229 -0.960600033 R
## 180 -0.318364841 -0.921452795 R
## 181 -0.557028431 -1.000347302 R
## 182 -1.342933236 -1.229833763 R
## 183 -1.325016764 -1.203346635 R
## 184 -0.862607677 -0.903846499 R
## 185 0.045146002 -1.229443601 R
## 186 0.437074722 -1.360688788 R
## 187 -0.077403279 -1.640333136 R
## 188 -0.125638032 -1.657275187 R
## 189 -0.517275875 -0.889216525 R
## 190 -0.294167245 -1.086218926 R
## 191 -0.310113401 -0.562354563 R
## 192 -0.297566301 -1.021540043 R
## 193 -0.310870399 -0.525463291 R
## 194 0.703852721 0.148757571 R
## 195 -0.050300971 -0.158482083 R
## 196 0.450144116 0.038151940 R
## 197 0.112591704 -0.144113810 R
## 198 0.727653830 0.156013729 R
## 199 0.205014899 -0.058163811 R
## 200 0.820894004 -0.033103728 R
## 201 -0.725082580 0.036955271 R
## 202 -0.655946484 -0.219209453 R
## 203 -0.723124380 0.280868725 R
## 204 -1.121817738 0.666915420 R
## 205 -1.106401172 -0.023584008 R
## 206 -0.157000177 0.251678836 R
## 207 -1.123828064 0.666573667 R
## 208 -0.938882051 0.514202506 R
## 209 -0.930462819 0.256473662 R
## 210 -0.872827627 0.459060921 R
## 211 -0.171945746 -1.413762652 R
## 212 -0.180283482 -1.405714226 R
## 213 -0.016300987 -1.560685612 R
## 214 -0.083518808 -1.618730111 R
## 215 -0.505285997 -1.684504044 R
## 216 -1.174123534 -0.839897872 R
## 217 -1.279860500 -0.582110741 R
## 218 -1.377082750 -0.751266116 R
## 219 -0.970648251 -0.705281241 R
## 220 -1.190222435 -0.536093069 R
## 221 -1.196659723 -0.784954555 R
## 222 -1.253287340 -0.646896207 R
PMA_PreModelling_Train_LR_OS_ADASYN$Label <- rep("LR_OS_ADASYN",nrow(PMA_PreModelling_Train_LR_OS_ADASYN))
##################################
# Verifying the class distribution
# for the oversampled data using OS_ADASYN
##################################
table(PMA_PreModelling_Train_LR_OS_ADASYN$Class)
##
## M R
## 111 111
##################################
# Formulating the structure of the
# Logistic Regression model
##################################
LR_OS_ADASYN_Model <- glm(Class ~ V1 + V11,
data = PMA_PreModelling_Train_LR_OS_ADASYN,
family = binomial)
##################################
# Consolidating the model results
##################################
summary(LR_OS_ADASYN_Model)
##
## Call:
## glm(formula = Class ~ V1 + V11, family = binomial, data = PMA_PreModelling_Train_LR_OS_ADASYN)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.19658 -0.90252 0.05818 0.80875 1.68379
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4101 0.1704 -2.406 0.0161 *
## V1 -0.5944 0.2092 -2.841 0.0045 **
## V11 -1.2753 0.2148 -5.938 2.89e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 307.76 on 221 degrees of freedom
## Residual deviance: 238.51 on 219 degrees of freedom
## AIC: 244.51
##
## Number of Fisher Scoring iterations: 4
LR_OS_ADASYN_Model_Coef <- (as.data.frame(LR_OS_ADASYN_Model$coefficients))
LR_OS_ADASYN_Model_Coef$Coef <- rownames(LR_OS_ADASYN_Model_Coef)
LR_OS_ADASYN_Model_Coef$Model <- rep("LR_OS_ADASYN",nrow(LR_OS_ADASYN_Model_Coef))
colnames(LR_OS_ADASYN_Model_Coef) <- c("Estimates","Coefficients","Model")
print(LR_OS_ADASYN_Model_Coef, rownames=FALSE)
## Estimates Coefficients Model
## (Intercept) -0.4101181 (Intercept) LR_OS_ADASYN
## V1 -0.5943822 V1 LR_OS_ADASYN
## V11 -1.2753114 V11 LR_OS_ADASYN
##################################
# Computing the model predictions
##################################
(LR_OS_ADASYN_Model_Probabilities <- predict(LR_OS_ADASYN_Model,
type = c("response")))
## 1 2 3 4 5 6 7
## 0.73772818 0.10497255 0.48494896 0.02896442 0.04416885 0.14187466 0.35298982
## 8 9 10 11 12 13 14
## 0.13518371 0.52465818 0.80638030 0.65192786 0.47642692 0.76667670 0.79825096
## 15 16 17 18 19 20 21
## 0.49045006 0.22563277 0.49124234 0.53149589 0.33788658 0.63266232 0.36893216
## 22 23 24 25 26 27 28
## 0.44577737 0.28975032 0.50841003 0.54957759 0.30403436 0.44012839 0.13580915
## 29 30 31 32 33 34 35
## 0.23425830 0.03850579 0.16603467 0.15576953 0.03615320 0.11201542 0.05655150
## 36 37 38 39 40 41 42
## 0.08175770 0.03278329 0.02139795 0.37136132 0.03089268 0.01331233 0.38955167
## 43 44 45 46 47 48 49
## 0.65126495 0.55947074 0.24727303 0.25025003 0.23441048 0.91040741 0.20957344
## 50 51 52 53 54 55 56
## 0.17258950 0.30485130 0.15948123 0.60850432 0.62090861 0.46015049 0.55448352
## 57 58 59 60 61 62 63
## 0.75312881 0.84972878 0.87949954 0.51859174 0.50205872 0.39855712 0.29561056
## 64 65 66 67 68 69 70
## 0.37370392 0.42217609 0.44608300 0.62022067 0.22674314 0.39134899 0.61942054
## 71 72 73 74 75 76 77
## 0.85093106 0.88724633 0.62758752 0.52149811 0.04594066 0.27777607 0.22716764
## 78 79 80 81 82 83 84
## 0.10369789 0.10997728 0.04766295 0.32399955 0.63553899 0.20078099 0.05642670
## 85 86 87 88 89 90 91
## 0.15127834 0.22638738 0.25274867 0.02699838 0.05205260 0.22027108 0.23133247
## 92 93 94 95 96 97 98
## 0.45499203 0.50158494 0.47436036 0.43346259 0.66212847 0.49743683 0.26750888
## 99 100 101 102 103 104 105
## 0.52203094 0.67231525 0.38096402 0.35437073 0.37001350 0.29351771 0.20504940
## 106 107 108 109 110 111 112
## 0.14025414 0.30145821 0.22508296 0.22092352 0.26547781 0.32450675 0.96153492
## 113 114 115 116 117 118 119
## 0.63113128 0.89281022 0.78004124 0.79069806 0.34357069 0.86087655 0.52828019
## 120 121 122 123 124 125 126
## 0.92386207 0.35531618 0.94938606 0.75708417 0.89718416 0.69734156 0.93788428
## 127 128 129 130 131 132 133
## 0.72074811 0.24230065 0.38787815 0.95448129 0.80984905 0.85057811 0.35536511
## 134 135 136 137 138 139 140
## 0.81543098 0.83188786 0.71492212 0.69470786 0.75211155 0.83042655 0.79922795
## 141 142 143 144 145 146 147
## 0.77353736 0.84947689 0.81248892 0.73867519 0.52886844 0.35490911 0.26369154
## 148 149 150 151 152 153 154
## 0.36344558 0.34583074 0.33425366 0.40565468 0.41193095 0.86172405 0.84233953
## 155 156 157 158 159 160 161
## 0.84100629 0.83101251 0.92778263 0.54619851 0.66594284 0.42242166 0.35660069
## 162 163 164 165 166 167 168
## 0.62609548 0.40960899 0.60931311 0.56790283 0.40131546 0.65384058 0.64790067
## 169 170 171 172 173 174 175
## 0.35222942 0.25694247 0.34930583 0.35385536 0.25558129 0.37046751 0.61931673
## 176 177 178 179 180 181 182
## 0.93684708 0.90690144 0.94088997 0.78500246 0.72196940 0.76794150 0.87615569
## 183 184 185 186 187 188 189
## 0.87125377 0.77821361 0.75601813 0.74372008 0.84913396 0.85546221 0.73718131
## 190 191 192 193 194 195 196
## 0.75951039 0.62043265 0.74450785 0.60939910 0.26538054 0.45559081 0.32600030
## 197 198 199 200 201 202 203
## 0.42720825 0.26084358 0.38750825 0.29821230 0.49343253 0.56447039 0.41617366
## 204 205 206 207 208 209 210
## 0.35575095 0.56895381 0.34575106 0.35612479 0.37570197 0.45409182 0.38301500
## 211 212 213 214 215 216 217
## 0.81683819 0.81604264 0.83060641 0.84604459 0.88477862 0.79557946 0.74894259
## 218 219 220 221 222
## 0.79681306 0.74388715 0.72730948 0.78620781 0.76129777
##################################
# Creating a classification index
# based on the model predictions
##################################
(LR_OS_ADASYN_Model_Indices <- predict(LR_OS_ADASYN_Model,
type = c("link")))
## 1 2 3 4 5 6
## 1.034194007 -2.143155516 -0.060222356 -3.512295020 -3.074561474 -1.799806198
## 7 8 9 10 11 12
## -0.605922800 -1.855882401 0.098712797 1.426659526 0.627524174 -0.094362275
## 13 14 15 16 17 18
## 1.189640156 1.375398467 -0.038204425 -1.233137433 -0.035034220 0.126150588
## 19 20 21 22 23 24
## -0.672726578 0.543655270 -0.536800580 -0.217746823 -0.896596981 0.033643295
## 25 26 27 28 29 30
## 0.198964127 -0.828159573 -0.240641004 -1.850543025 -1.184420539 -3.217679974
## 31 32 33 34 35 36
## -1.613995214 -1.690047948 -3.283166961 -2.070317881 -2.814390118 -2.418701365
## 37 38 39 40 41 42
## -3.384503515 -3.822829815 -0.526381159 -3.445856218 -4.305662512 -0.449197159
## 43 44 45 46 47 48
## 0.624604084 0.239014356 -1.113209442 -1.097279217 -1.183572402 2.318619541
## 49 50 51 52 53 54
## -1.327498549 -1.567384980 -0.824301675 -1.662093081 0.441029528 0.493406603
## 55 56 57 58 59 60
## -0.159736808 0.218802829 1.115369581 1.732475427 1.987699485 0.074401266
## 61 62 63 64 65 66
## 0.008234922 -0.411480728 -0.868288495 -0.516359404 -0.313846676 -0.216509810
## 67 68 69 70 71 72
## 0.490484945 -1.226793467 -0.441645313 0.487089454 1.741922218 2.062917134
## 73 74 75 76 77 78
## 0.521881088 0.086045493 -3.033375348 -0.955519951 -1.224373916 -2.156795762
## 79 80 81 82 83 84
## -2.090973219 -2.994764558 -0.735451606 0.556053884 -1.381420321 -2.816731639
## 85 86 87 88 89 90
## -1.724609805 -1.228823674 -1.084005982 -3.584608772 -2.902044296 -1.264087373
## 91 92 93 94 95 96
## -1.200802614 -0.180520511 0.006339761 -0.102648596 -0.267737627 0.672793868
## 97 98 99 100 101 102
## -0.010252778 -1.007298471 0.088180865 0.718675356 -0.485458441 -0.599881837
## 103 104 105 106 107 108
## -0.532158893 -0.878360151 -1.355029065 -1.813180759 -0.840363591 -1.236286943
## 109 110 111 112 113 114
## -1.260292650 -1.017688967 -0.733136834 3.218780081 0.537073094 2.119773149
## 115 116 117 118 119 120
## 1.265906734 1.329138280 -0.647422117 1.822589419 0.113241617 2.496016154
## 121 122 123 124 125 126
## -0.595751943 2.931588504 1.136759420 2.166321675 0.834670434 2.714627456
## 127 128 129 130 131 132
## 0.948175492 -1.140107373 -0.456240118 3.043044607 1.449029645 1.739142503
## 133 134 135 136 137 138
## -0.595538372 1.485693315 1.599066382 0.919411187 0.822222252 1.109905829
## 139 140 141 142 143 144
## 1.588653339 1.381476015 1.228394001 1.730504144 1.466264352 1.039094181
## 145 146 147 148 149 150
## 0.115602324 -0.597529479 -1.026869141 -0.560440357 -0.637416654 -0.689008549
## 151 152 153 154 155 156
## -0.381958248 -0.355988614 1.829683742 1.675739359 1.665734531 1.592820196
## 157 158 159 160 161 162
## 2.553116895 0.185322642 0.689891703 -0.312840064 -0.590148900 0.515502402
## 163 164 165 166 167 168
## -0.365582021 0.444425835 0.273299828 -0.399987029 0.635964130 0.609824089
## 169 170 171 172 173 174
## -0.609253877 -1.061921255 -0.622091916 -0.602135123 -1.069063118 -0.530211707
## 175 176 177 178 179 180
## 0.486649104 2.696961062 2.276375065 2.767425616 1.295060256 0.954251593
## 181 182 183 184 185 186
## 1.196724048 1.956518579 1.912090011 1.255286332 1.130971418 1.065394456
## 187 188 189 190 191 192
## 1.727824656 1.778100926 1.031369484 1.149997104 0.491384999 1.069531722
## 193 194 195 196 197 198
## 0.444787084 -1.018187850 -0.178106078 -0.726331346 -0.293250606 -1.041588666
## 199 200 201 202 203 204
## -0.457798326 -0.855825310 -0.026271400 0.259325140 -0.338500938 -0.593854473
## 205 206 207 208 209 210
## 0.277584017 -0.637768884 -0.592223728 -0.507831661 -0.184151359 -0.476770541
## 211 212 213 214 215 216
## 1.495071084 1.489762635 1.589931142 1.703909032 2.038482188 1.358891395
## 217 218 219 220 221 222
## 1.092980686 1.366493646 1.066271179 0.981014553 1.302216663 1.159807719
max(LR_OS_ADASYN_Model_Indices)
## [1] 3.21878
min(LR_OS_ADASYN_Model_Indices)
## [1] -4.305663
##################################
# Consolidating the model probabilities
# and classification index
# based on the model predictions
##################################
LR_OS_ADASYN_Model_Predictions <- as.data.frame(PMA_PreModelling_Train_LR_OS_ADASYN)
LR_OS_ADASYN_Model_Predictions$LR_OS_ADASYN_Prob <- LR_OS_ADASYN_Model_Probabilities
LR_OS_ADASYN_Model_Predictions$LR_OS_ADASYN_LP <- LR_OS_ADASYN_Model_Indices
LR_OS_ADASYN_Model_Predictions$Class <- as.factor(LR_OS_ADASYN_Model_Predictions$Class)
LR_OS_ADASYN_Model_Predictions$Label <- rep("LR_OS_ADASYN",nrow(LR_OS_ADASYN_Model_Predictions))
##################################
# Formulating the probability curve
# using the consolidated model predictions
##################################
LR_OS_ADASYN_Model_Predictions %>%
ggplot(aes(x = LR_OS_ADASYN_LP ,
y = LR_OS_ADASYN_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
ggtitle("Estimated Rock Detection Probabilities Based on Classification Index : Logistic Regression (OS_ADASYN)") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

1.5.7 Oversampling - Borderline Synthetic Minority Oversampling Technique (LR_OS_BSMOTE)
The Borderline Synthetic Minority Oversampling Technique generates new instances of the minority class using nearest neighbors of these cases in the border region between classes. The algorithm initially classifies instances of the minority class into noise, safe and danger categories. Instances whose surrounding nearest neighbors all belong to the majority class are denoted as noise; these would have adverse effects on the data distribution and are thus excluded from synthetic data generation. Instances for which more than half of the surrounding neighbors belong to the same minority class are denoted as safe. Oversampling is only performed on minority class instances in the danger category, which covers those with more than half of their nearest neighbors from the majority class. A minimal sketch of this categorization follows the summary of findings below.
Logistic Regression models the relationship between the probability of an event (among two outcome levels) by expressing the log-odds of the event as a linear combination of a set of predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood: candidate coefficient values are evaluated over multiple iterations, and the set that maximizes the log-likelihood of the observed data is retained. Given the optimal parameters, the conditional probability for each observation is obtained by passing its linear predictor through the logistic function.
[A] The class ratio of the original data was noted at 80:20.
[A.1] Majority Class = Class=M with 111 instances
[A.2] Minority Class = Class=R with 25 instances
[B] The class ratio of the oversampled data was noted at 50:50, with the majority of the added instances being unique values for the minority class.
[B.1] Majority Class = Class=M with 111 instances
[B.2] Minority Class = Class=R with 111 instances
[C] The logistic regression model from the stats package was implemented. The Class response was regressed against the V1 and V11 predictors.
[D] The logistic curve formulated by plotting the predicted probabilities against the classification index (logit values) showed a sufficiently balanced logistic profile for the predicted points from both the majority and minority classes. Minimal overlap between both classes was observed, driving better differentiation, although a minimal skew was still present due to a longer tail for the predicted points belonging to the majority class.
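A minimal sketch of the categorization step (illustrative only; the helper name bsmote_category and k=5 are assumptions, not the themis implementation): m' counts the majority class points among each minority point's k nearest neighbors, with m' = k marking noise, m' of at least half of k marking danger, and the remainder marked safe.
##################################
# Sketching the Borderline-SMOTE categorization
# (illustrative only; not the themis internals)
##################################
bsmote_category <- function(X, y, minority = "R", k = 5) {
D <- as.matrix(dist(X))
diag(D) <- Inf
sapply(which(y == minority), function(i) {
m_prime <- sum(y[order(D[i, ])[seq_len(k)]] != minority)
if (m_prime == k) "noise" # all neighbors majority: excluded
else if (m_prime >= k / 2) "danger" # borderline: used for oversampling
else "safe" # mostly minority: not oversampled
})
}
table(bsmote_category(PMA_PreModelling_Train[, c("V1", "V11")],
PMA_PreModelling_Train$Class))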
##################################
# Visualizing the imbalanced data set
##################################
ggplot(PMA_PreModelling_Train, aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "Without Oversampling - Borderline Synthetic Minority Oversampling Technique") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

##################################
# Implementing OS_BSMOTE
# Visualizing the oversampled data using OS_BSMOTE
##################################
recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_bsmote(Class, seed=123456789) %>%
prep() %>%
bake(new_data = NULL) %>%
ggplot(aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "With Oversampling - Borderline Synthetic Minority Oversampling Technique") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

OS_BSMOTE <- recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_bsmote(Class, seed=123456789) %>%
prep()
PMA_PreModelling_Train_LR_OS_BSMOTE <- OS_BSMOTE %>%
bake(new_data = NULL)
(PMA_PreModelling_Train_LR_OS_BSMOTE <- as.data.frame(PMA_PreModelling_Train_LR_OS_BSMOTE))
## V1 V11 Class
## 1 0.914240973 -1.55861587 M
## 2 2.175018299 0.34520609 M
## 3 -0.230553865 -0.16690717 M
## 4 1.231717734 1.85842121 M
## 5 0.424204681 1.89154083 M
## 6 1.143139503 0.55690423 M
## 7 -0.507041623 0.38985083 M
## 8 0.312328639 0.98808941 M
## 9 -0.935159101 0.03686239 M
## 10 0.408807916 -1.63079048 M
## 11 0.738227771 -1.15770286 M
## 12 1.169078303 -0.79246144 M
## 13 0.118911997 -1.30982705 M
## 14 -0.174409184 -1.31877657 M
## 15 1.012209983 -0.76338471 M
## 16 0.813818599 0.26605270 M
## 17 0.207991857 -0.39104999 M
## 18 -0.957451233 0.02573746 M
## 19 0.695599819 -0.11827986 M
## 20 -1.204708453 -0.18639848 M
## 21 -0.068999246 0.13149291 M
## 22 0.527094931 -0.39650480 M
## 23 0.104263091 0.33286517 M
## 24 0.465616266 -0.56497212 M
## 25 -0.507041623 -0.24127887 M
## 26 0.043933590 0.30731955 M
## 27 0.147716923 -0.20173692 M
## 28 0.612463710 0.84401925 M
## 29 -0.068999246 0.63930609 M
## 30 1.395977603 1.55085071 M
## 31 -0.180527407 1.02812486 M
## 32 0.565359423 0.74012531 M
## 33 2.230423635 1.21329169 M
## 34 0.782380194 0.93715691 M
## 35 2.005116912 0.95072168 M
## 36 1.784287809 0.74337479 M
## 37 1.523834205 1.62207084 M
## 38 1.928176420 1.77732114 M
## 39 -1.204708453 0.65264083 M
## 40 1.934080462 1.47897609 M
## 41 0.744203377 2.70773324 M
## 42 1.424344244 -0.63319889 M
## 43 -0.491314245 -0.58236264 M
## 44 0.689392711 -0.83030322 M
## 45 1.381555338 -0.09258957 M
## 46 1.002499594 0.07158504 M
## 47 0.977896818 0.15071601 M
## 48 0.278484686 -2.26945660 M
## 49 1.406688795 0.06372536 M
## 50 1.852613244 0.04399440 M
## 51 1.281675939 -0.27257795 M
## 52 1.390588252 0.33359231 M
## 53 -0.192852168 -0.57752146 M
## 54 -0.180527407 -0.62433567 M
## 55 -0.052243905 -0.17198030 M
## 56 -0.779287302 -0.12994977 M
## 57 -0.041194136 -1.17696945 M
## 58 -0.924156756 -1.24933498 M
## 59 -0.168320027 -1.80173334 M
## 60 -2.093160440 0.59563328 M
## 61 -0.230553865 -0.22058604 M
## 62 -1.038676203 0.48516249 M
## 63 -0.030238811 0.37335493 M
## 64 0.089444831 0.04161877 M
## 65 0.303951059 -0.21715050 M
## 66 -0.132380328 -0.09011428 M
## 67 -1.546463816 0.01457488 M
## 68 -0.499153812 0.87301303 M
## 69 -0.108968439 0.07550792 M
## 70 0.686277848 -1.02337267 M
## 71 -0.721885235 -1.35101476 M
## 72 -3.557061230 -0.28132844 M
## 73 -0.789109306 -0.36302243 M
## 74 -0.750264961 -0.03937819 M
## 75 -0.379134945 2.23365699 M
## 76 -0.371994173 0.60103638 M
## 77 0.401039618 0.45156422 M
## 78 -0.295964244 1.50754826 M
## 79 0.256869245 1.19827720 M
## 80 1.243886483 1.44694263 M
## 81 -0.230553865 0.36255507 M
## 82 -0.256318859 -0.63813480 M
## 83 0.632133128 0.46700244 M
## 84 0.324793230 1.73570326 M
## 85 0.723165726 0.69367752 M
## 86 -1.191144673 1.19711994 M
## 87 -1.177722925 1.07730973 M
## 88 0.142960831 2.42255907 M
## 89 0.443194144 1.74741590 M
## 90 -0.180527407 0.75375436 M
## 91 0.544629613 0.36615870 M
## 92 -1.274767709 0.41409622 M
## 93 -0.539087424 -0.07530230 M
## 94 -0.555415865 0.01776805 M
## 95 0.345301912 -0.27257795 M
## 96 -1.868589923 0.02175517 M
## 97 -0.217863016 -0.21200423 M
## 98 0.625610136 0.17668535 M
## 99 -0.799007156 -0.01833539 M
## 100 -2.013850697 0.05347991 M
## 101 0.537644499 -0.19150300 M
## 102 -0.013979414 0.15531332 M
## 103 -0.935159101 0.53154288 M
## 104 -0.779287302 0.73036007 M
## 105 0.424204681 0.54321732 M
## 106 0.157176488 1.02691760 M
## 107 -0.323092564 0.48794823 M
## 108 0.377448180 0.47190071 M
## 109 0.992715078 0.20396774 M
## 110 0.295518362 0.33867807 M
## 111 0.099342681 0.20698569 M
## 112 -2.902302684 -1.49282840 R
## 113 -0.588710342 -0.46833442 R
## 114 -0.597170869 -1.70542147 R
## 115 -0.289270938 -1.17938825 R
## 116 0.544629613 -1.61762409 R
## 117 0.064360561 0.15607894 R
## 118 -0.168320027 -1.67226688 R
## 119 -0.316256467 -0.26298086 R
## 120 -1.638716948 -1.51501038 R
## 121 0.274190661 0.01776805 R
## 122 -0.860055136 -2.21946189 R
## 123 -0.789109306 -0.84516218 R
## 124 -1.427539411 -1.35491277 R
## 125 0.779483548 -1.33935885 R
## 126 -0.400799099 -2.26338258 R
## 127 -0.302692686 -0.92399268 R
## 128 0.827857790 0.18656255 R
## 129 -0.829167731 0.42261407 R
## 130 -2.434915803 -1.57286451 R
## 131 -1.138280881 -0.92728240 R
## 132 -0.860055136 -1.28443852 R
## 133 -1.125398710 0.66990478 R
## 134 -0.186674974 -1.39954451 R
## 135 -0.008605128 -1.57143555 R
## 136 -1.348878827 -0.41384379 R
## 137 -2.110893443 -0.94312757 R
## 138 -2.045433423 -1.50787056 R
## 139 -2.727906546 -1.49588989 R
## 140 -1.331613148 -0.98926479 R
## 141 -2.884342616 -1.48707040 R
## 142 -1.692543814 -0.65254827 R
## 143 -2.543559671 -1.55426015 R
## 144 -0.200710576 -1.41000300 R
## 145 -0.579994800 -1.75422471 R
## 146 -0.726952717 -1.49758877 R
## 147 -0.427587806 -2.18726646 R
## 148 -0.234409462 -1.43511339 R
## 149 -0.638609937 -1.78645087 R
## 150 -0.424385927 -2.19636412 R
## 151 -0.539891272 -1.86817291 R
## 152 -0.209046173 -1.77581970 R
## 153 -0.198503193 -1.74901247 R
## 154 -0.085576490 -1.62002917 R
## 155 -0.158312844 -1.66594914 R
## 156 -0.255554034 -1.31678566 R
## 157 -0.101553943 -1.63011607 R
## 158 -0.175951690 -1.55887373 R
## 159 -1.536305519 -1.43737038 R
## 160 -1.482954163 -1.39692372 R
## 161 -1.569819776 -1.57734131 R
## 162 -2.179855466 -1.55433108 R
## 163 -1.554224066 -1.59145067 R
## 164 -1.806165345 -1.52717767 R
## 165 -1.620621314 -1.53138141 R
## 166 -0.960871406 -2.12825390 R
## 167 -0.769103007 -2.22816004 R
## 168 -0.703740137 -2.09580937 R
## 169 -1.116503328 -1.82876902 R
## 170 -0.405250123 -1.85968973 R
## 171 -0.708435397 -2.09952354 R
## 172 -0.659278473 -1.82686585 R
## 173 -0.626623939 -2.03480683 R
## 174 -0.336673771 -1.14769148 R
## 175 -0.711430840 -0.69909654 R
## 176 -0.754841345 -0.85071577 R
## 177 -0.842929035 -0.85781982 R
## 178 -0.495358659 -0.89276851 R
## 179 -0.831157170 -1.10551056 R
## 180 -0.652182669 -0.58768701 R
## 181 -1.157096603 -1.32132725 R
## 182 -1.122671198 -1.31705206 R
## 183 -1.243061435 -1.08218655 R
## 184 -0.939653544 -1.29432362 R
## 185 -1.367768189 -1.34748995 R
## 186 -1.542847586 -1.44233004 R
## 187 -1.000539548 -1.01397735 R
## 188 -0.773256346 -2.22776284 R
## 189 -0.226926303 -1.82128283 R
## 190 -0.350132468 -2.05897887 R
## 191 -0.691041322 -2.23562543 R
## 192 -0.250215562 -1.88049954 R
## 193 -0.065672120 -1.67211873 R
## 194 -0.090517759 -1.71595384 R
## 195 -1.736178623 -1.42168866 R
## 196 -2.664395207 -1.53356807 R
## 197 -1.663602473 -1.51681863 R
## 198 -2.158723504 -1.55279557 R
## 199 -2.306693552 -1.43602548 R
## 200 -1.551549377 -0.63013419 R
## 201 -2.133114929 -1.25078205 R
## 202 -1.307953653 -1.17812109 R
## 203 -0.959054286 -1.15735412 R
## 204 -0.801121364 -0.84798724 R
## 205 -1.361056552 -1.25662668 R
## 206 -0.979418863 -0.88992030 R
## 207 -1.031861506 -0.83841126 R
## 208 -1.114796484 -0.95742913 R
## 209 -0.732133486 -1.26089511 R
## 210 -0.999679111 -1.30177802 R
## 211 -0.858361528 -1.27395218 R
## 212 -1.042843755 -1.04979426 R
## 213 -0.827023587 -1.07991659 R
## 214 -0.811668799 -0.98484412 R
## 215 -1.087905903 -0.99194835 R
## 216 -0.185382331 -1.41875093 R
## 217 -0.417948424 -1.57187563 R
## 218 -0.079660886 -1.50284534 R
## 219 -0.022012673 -1.55849323 R
## 220 -0.284049367 -1.19059300 R
## 221 -0.174278835 -1.58372942 R
## 222 -0.277331844 -1.02794573 R
PMA_PreModelling_Train_LR_OS_BSMOTE$Label <- rep("LR_OS_BSMOTE",nrow(PMA_PreModelling_Train_LR_OS_BSMOTE))
##################################
# Verifying the class distribution
# for the oversampled data using OS_BSMOTE
##################################
table(PMA_PreModelling_Train_LR_OS_BSMOTE$Class)
##
## M R
## 111 111
##################################
# Formulating the structure of the
# Logistic Regression model
##################################
LR_OS_BSMOTE_Model <- glm(Class ~ V1 + V11,
data = PMA_PreModelling_Train_LR_OS_BSMOTE,
family = binomial)
##################################
# Consolidating the model results
##################################
summary(LR_OS_BSMOTE_Model)
##
## Call:
## glm(formula = Class ~ V1 + V11, family = binomial, data = PMA_PreModelling_Train_LR_OS_BSMOTE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.51118 -0.31186 0.04469 0.46486 2.82180
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.3748 0.4136 -5.742 9.36e-09 ***
## V1 -1.3384 0.3235 -4.138 3.50e-05 ***
## V11 -2.5707 0.3536 -7.271 3.58e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 307.76 on 221 degrees of freedom
## Residual deviance: 126.59 on 219 degrees of freedom
## AIC: 132.59
##
## Number of Fisher Scoring iterations: 6
LR_OS_BSMOTE_Model_Coef <- (as.data.frame(LR_OS_BSMOTE_Model$coefficients))
LR_OS_BSMOTE_Model_Coef$Coef <- rownames(LR_OS_BSMOTE_Model_Coef)
LR_OS_BSMOTE_Model_Coef$Model <- rep("LR_OS_BSMOTE",nrow(LR_OS_BSMOTE_Model_Coef))
colnames(LR_OS_BSMOTE_Model_Coef) <- c("Estimates","Coefficients","Model")
print(LR_OS_BSMOTE_Model_Coef, row.names=FALSE)
## Estimates Coefficients Model
## -2.374787 (Intercept) LR_OS_BSMOTE
## -1.338444 V1 LR_OS_BSMOTE
## -2.570724 V11 LR_OS_BSMOTE
##################################
# Computing the model predictions
##################################
(LR_OS_BSMOTE_Model_Probabilities <- predict(LR_OS_BSMOTE_Model,
type = c("response")))
## 1 2 3 4 5 6
## 6.006859e-01 2.079892e-03 1.628557e-01 1.505806e-04 4.074472e-04 4.789811e-03
## 7 8 9 10 11 12
## 6.307140e-02 4.806695e-03 2.283076e-01 7.808042e-01 4.045050e-01 1.298446e-01
## 13 14 15 16 17 18
## 6.970438e-01 7.771026e-01 1.459006e-01 1.555055e-02 1.613942e-01 2.387661e-01
## 19 20 21 22 23 24
## 4.734715e-02 4.296759e-01 6.783271e-02 1.129486e-01 3.324495e-02 1.757225e-01
## 25 26 27 28 29 30
## 2.542867e-01 3.828633e-02 1.136601e-01 4.659037e-03 1.934256e-02 2.664560e-04
## 31 32 33 34 35 36
## 8.357633e-03 6.469618e-03 2.077176e-04 2.926166e-03 5.513619e-04 1.261781e-03
## 37 38 39 40 41 42
## 1.869932e-04 7.303152e-05 8.016464e-02 1.560020e-04 3.258266e-05 6.577898e-02
## 43 44 45 46 47 48
## 4.451968e-01 2.381189e-01 1.823696e-02 1.982859e-02 1.677255e-02 9.563376e-01
## 49 50 51 52 53 54
## 1.187490e-02 6.912386e-03 3.262472e-02 6.098557e-03 3.470533e-01 3.709446e-01
## 55 56 57 58 59 60
## 1.343831e-01 2.693959e-01 6.695195e-01 8.883345e-01 9.228887e-01 2.489130e-01
## 61 62 63 64 65 66
## 1.825541e-01 9.693127e-02 3.577391e-02 6.904229e-02 9.767097e-02 1.228251e-01
## 67 68 69 70 71 72
## 4.152266e-01 1.887292e-02 8.143187e-02 3.401731e-01 8.874048e-01 9.572771e-01
## 73 74 75 76 77 78
## 4.048304e-01 2.193652e-01 4.954973e-04 3.161516e-02 1.675101e-02 2.859779e-03
## 79 80 81 82 83 84
## 3.021328e-03 4.265516e-04 4.750576e-02 4.034164e-01 1.187473e-02 6.945722e-04
## 85 86 87 88 89 90
## 5.905442e-03 2.067431e-02 2.743980e-02 1.516378e-04 5.752660e-04 1.677652e-02
## 91 92 93 94 95 96
## 1.720777e-02 1.501903e-01 1.885182e-01 1.574837e-01 1.056248e-01 5.175650e-01
## 97 98 99 100 101 102
## 1.767992e-01 2.493234e-02 2.212795e-01 5.456617e-01 6.900384e-02 5.978552e-02
## 103 104 105 106 107 108
## 7.659193e-02 3.881689e-02 1.288117e-02 5.351133e-03 3.928910e-02 1.641340e-02
## 109 110 111 112 113 114
## 1.437443e-02 2.555662e-02 4.565748e-02 9.952622e-01 4.054342e-01 9.431421e-01
## 115 116 117 118 119 120
## 7.396726e-01 7.416806e-01 5.405582e-02 8.956161e-01 2.183267e-01 9.761822e-01
## 121 122 123 124 125 126
## 5.800597e-02 9.888154e-01 7.014203e-01 9.534252e-01 5.062597e-01 9.816606e-01
## 127 128 129 130 131 132
## 6.000517e-01 1.866196e-02 8.695320e-02 9.928083e-01 8.223755e-01 8.887747e-01
## 133 134 135 136 137 138
## 6.974294e-02 8.134993e-01 8.424349e-01 6.211635e-01 9.465990e-01 9.857863e-01
## 139 140 141 142 143 144
## 9.940706e-01 8.755153e-01 9.950755e-01 8.275139e-01 9.934726e-01 8.203295e-01
## 145 146 147 148 149 150
## 9.483940e-01 9.204214e-01 9.785506e-01 8.359281e-01 9.557410e-01 9.789479e-01
## 151 152 153 154 155 156
## 9.589239e-01 9.220227e-01 9.158404e-01 8.703888e-01 8.928129e-01 7.945136e-01
## 157 158 159 160 161 162
## 8.756336e-01 8.662439e-01 9.669637e-01 9.608783e-01 9.777136e-01 9.894245e-01
## 163 164 165 166 167 168
## 9.780466e-01 9.814510e-01 9.765941e-01 9.876594e-01 9.876613e-01 9.811992e-01
## 169 170 171 172 173 174
## 9.785606e-01 9.501862e-01 9.814891e-01 9.609841e-01 9.757512e-01 7.361844e-01
## 175 176 177 178 179 180
## 5.925718e-01 6.947628e-01 7.228503e-01 6.418228e-01 8.291525e-01 5.022260e-01
## 181 182 183 184 185 186
## 9.289512e-01 9.250913e-01 8.880508e-01 9.011649e-01 9.488224e-01 9.676439e-01
## 187 188 189 190 191 192
## 8.279311e-01 9.877165e-01 9.315634e-01 9.672930e-01 9.865775e-01 9.423702e-01
## 193 194 195 196 197 198
## 8.820166e-01 8.963778e-01 9.735002e-01 9.941401e-01 9.770490e-01 9.890819e-01
## 199 200 201 202 203 204
## 9.879220e-01 7.894770e-01 9.757711e-01 9.171668e-01 8.680804e-01 7.062853e-01
## 205 206 207 208 209 210
## 9.356769e-01 7.727403e-01 7.616280e-01 8.290050e-01 8.637211e-01 9.096805e-01
## 211 212 213 214 215 216
## 8.858527e-01 8.480944e-01 8.188099e-01 7.761464e-01 8.363531e-01 8.206201e-01
## 217 218 219 220 221 222
## 9.025137e-01 8.313514e-01 8.403899e-01 7.438513e-01 8.732280e-01 6.545208e-01
##################################
# Creating a classification index
# based on the model predictions
##################################
(LR_OS_BSMOTE_Model_Indices <- predict(LR_OS_BSMOTE_Model,
type = c("link")))
## 1 2 3 4 5
## 0.408324003 -6.173357428 -1.637131633 -8.800861401 -7.805191778
## 6 7 8 9 10
## -5.336462967 -2.698339590 -5.332927345 -1.217892324 1.270358909
## 11 12 13 14 15
## -0.386729015 -1.902333371 0.833259849 1.248860586 -1.767122159
## 16 17 18 19 20
## -4.147986246 -1.647891134 -1.159456434 -3.001743931 -0.283173443
## 21 22 23 24 25
## -2.620467808 -2.060969930 -3.370042340 -1.545601145 -1.075879089
## 26 27 28 29 30
## -3.223623920 -2.053888243 -5.364276686 -3.925915527 -8.230035045
## 31 32 33 34 35
## -4.776187215 -5.034147554 -8.479123183 -5.831131646 -7.502567613
## 36 37 38 39 40
## -6.673968501 -8.584251291 -9.524546462 -2.440112242 -8.765485802
## 41 42 43 44 45
## -10.331697835 -2.653412674 -0.220096954 -1.163020255 -3.985899663
## 46 47 48 49 50
## -3.900602432 -4.071096916 3.086623816 -4.421382008 -4.967504071
## 51 52 53 54 55
## -3.389516181 -5.093585877 -0.632017084 -0.528166665 -1.862747933
## 56 57 58 59 60
## -0.997689924 0.706012696 2.073840593 2.482259349 -1.104418259
## 61 62 63 64 65
## -1.499138052 -2.231796468 -3.294107073 -2.601494709 -2.223374824
## 66 67 68 69 70
## -1.965944774 -0.342400130 -3.950973850 -2.423049316 -0.662522844
## 71 72 73 74 75
## 2.064502155 3.109357894 -0.385378145 -1.269369274 -7.609452965
## 76 77 78 79 80
## -3.421992894 -4.072403649 -5.854146889 -5.799032924 -7.759350558
## 81 82 83 84 85
## -2.998233112 -0.391250280 -4.421396791 -7.271519668 -5.125957993
## 86 87 88 89 90
## -3.857972335 -3.567937550 -8.793864147 -7.460102633 -4.070856278
## 91 92 93 94 95
## -4.045036769 -1.733109397 -1.459667592 -1.677071125 -2.136231934
## 96 97 98 99 100
## 0.070289110 -1.538185522 -3.666340925 -1.258225807 0.183156955
## 101 102 103 104 105
## -2.602093063 -2.755344499 -2.489579517 -3.209309392 -4.339023651
## 106 107 108 109 110
## -5.225081444 -3.196726510 -4.093107340 -4.227825845 -3.640970185
## 111 112 113 114 115
## -3.039855195 5.347432700 -0.382872854 2.808660917 1.044267677
## 116 117 118 119 120
## 1.054722007 -2.862166349 2.149436749 -1.275444526 3.713217647
## 121 122 123 124 125
## -2.787453045 4.481973033 0.854070247 3.019001507 0.025039910
## 126 127 128 129 130
## 3.980192518 0.405680323 -3.962429636 -2.351417088 4.927612286
## 131 132 133 134 135
## 1.532525319 2.078285669 -2.590644755 1.472909778 1.676457742
## 136 137 138 139 140
## 0.494489753 2.875046379 4.239230380 5.121883467 1.950629387
## 141 142 143 144 145
## 5.308591914 1.568109532 5.025199351 1.518581542 2.911131389
## 146 147 148 149 150
## 2.448086088 3.820374146 1.628237508 3.072429056 3.839476193
## 151 152 153 154 155
## 3.150384469 2.470152182 2.387126982 1.904400400 2.119801522
## 156 157 158 159 160
## 1.352350344 1.951715959 1.868148771 3.376554627 3.201169629
## 161 162 163 164 165
## 3.781238284 4.538583963 3.796635579 3.968616703 3.731083062
## 166 167 168 169 170
## 4.382439385 4.382599258 3.954877615 3.820850921 2.948366914
## 171 172 173 174 175
## 3.970710070 3.203988514 3.694841183 1.026230052 0.374607475
## 176 177 178 179 180
## 0.822481260 0.958644246 0.583284207 1.579632898 0.008903908
## 181 182 183 184 185
## 2.570689819 2.513622999 2.070984108 2.210235552 2.919919051
## 186 187 188 189 190
## 3.398060745 1.571035089 4.387137173 2.610956908 3.386912469
## 191 192 193 194 195
## 4.297309508 2.794358127 2.011667419 2.157609899 3.603760225
## 196 197 198 199 200
## 5.133737324 3.751174055 4.506352641 4.404218553 1.321775934
## 201 202 203 204 205
## 3.695683464 2.404459967 1.884091524 0.877410183 2.677351471
## 206 207 208 209 210
## 1.223849729 1.161625750 1.578591698 1.846546094 2.309739633
## 211 212 213 214 215
## 2.049061392 1.719732291 1.508305278 1.243348656 1.631339559
## 216 217 218 219 220
## 1.520554047 2.225472173 1.595235372 1.661131855 1.066083204
## 221 222
## 1.929806883 0.638970888
max(LR_OS_BSMOTE_Model_Indices)
## [1] 5.347433
min(LR_OS_BSMOTE_Model_Indices)
## [1] -10.3317
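The two prediction scales are deterministically related: the response-scale probability is the inverse logit of the link-scale classification index. A minimal sanity check (a sketch, not part of the original output) would be:
##################################
# Sanity check (sketch): the predicted probabilities
# equal the inverse logit of the classification index
##################################
all.equal(as.numeric(plogis(LR_OS_BSMOTE_Model_Indices)),
as.numeric(LR_OS_BSMOTE_Model_Probabilities))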
##################################
# Consolidating the model probabilities
# and classification index
# based on the model predictions
##################################
LR_OS_BSMOTE_Model_Predictions <- as.data.frame(PMA_PreModelling_Train_LR_OS_BSMOTE)
LR_OS_BSMOTE_Model_Predictions$LR_OS_BSMOTE_Prob <- LR_OS_BSMOTE_Model_Probabilities
LR_OS_BSMOTE_Model_Predictions$LR_OS_BSMOTE_LP <- LR_OS_BSMOTE_Model_Indices
LR_OS_BSMOTE_Model_Predictions$Class <- as.factor(LR_OS_BSMOTE_Model_Predictions$Class)
LR_OS_BSMOTE_Model_Predictions$Label <- rep("LR_OS_BSMOTE",nrow(LR_OS_BSMOTE_Model_Predictions))
##################################
# Formulating the probability curve
# using the consolidated model predictions
##################################
LR_OS_BSMOTE_Model_Predictions %>%
ggplot(aes(x = LR_OS_BSMOTE_LP ,
y = LR_OS_BSMOTE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
ggtitle("Estimated Rock Detection Probabilities Based on Classification Index : Logistic Regression (OS_BSMOTE)") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

1.5.8 Oversampling - Synthetic Minority Oversampling Technique (LR_OS_SMOTE)
Synthetic Minority Oversampling Technique generates new minority instances by interpolating between existing minority instances. The new instances are not just copies of existing minority cases; instead, the algorithm samples the feature space around each target case and its nearest neighbors, and generates new instances that combine the features of the target case with the features of its neighbors.
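To make the interpolation concrete, the sketch below generates a single synthetic instance between a minority case and one of its nearest neighbors. Both points are hypothetical stand-ins rather than rows from the actual data, and the interpolation weight is drawn uniformly from [0,1], as in the original SMOTE formulation:
##################################
# Illustrative sketch (not the step_smote implementation):
# one synthetic instance interpolated between a minority
# case and a randomly chosen nearest neighbor
##################################
minority_case <- c(V1=-0.59, V11=-0.47) # hypothetical minority instance
neighbor <- c(V1=-0.32, V11=-0.26) # hypothetical nearest neighbor
gap <- runif(1) # random interpolation weight in [0,1]
(synthetic <- minority_case + gap * (neighbor - minority_case))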
Logistic Regression models the relationship between the probability of an event (among two outcome levels) by having the log-odds of the event be a linear combination of a set of predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood, with the fitting algorithm iteratively testing candidate values to optimize the fit of the log-odds. These iterations evaluate the log-likelihood function, which logistic regression seeks to maximize in order to find the best parameter estimates. Given the optimal parameters, the conditional probability for each observation can be calculated, and the sum of their logarithms yields the log-likelihood of the fitted model.
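As a concrete illustration, the sketch below computes a single predicted probability by hand as the inverse logit of the linear predictor, using the coefficients of the LR_OS_BSMOTE_Model fitted earlier; the observation itself is hypothetical:
##################################
# Illustrative sketch: a predicted probability computed
# by hand as the inverse logit of the linear predictor
##################################
b <- coef(LR_OS_BSMOTE_Model)
new_obs <- c(V1=0, V11=0) # hypothetical observation
logit <- b["(Intercept)"] + b["V1"]*new_obs["V1"] + b["V11"]*new_obs["V11"]
1/(1+exp(-logit)) # equivalently, plogis(logit)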
[A] The class ratio of the original data was noted at
80:20.
[A.1] Majority Class =
Class=M with 111 instances
[A.2] Minority Class =
Class=R with 25 instances
[B] The class ratio of the oversampled data was noted at 50:50, with the majority of the added instances being unique values for the minority class.
[B.1] Majority Class =
Class=M with 111 instances
[B.2] Minority Class =
Class=R with 111 instances
[C] The logistic regression model from the
stats
package was implemented. The
Class
response was regressed against the
V1 and
V11 predictors.
[D] The logistic curve formulated by plotting the predicted probabilities against the classification index using the logit values showed a sufficiently balanced logistic profile for the predicted points from both the majority and minority classes. Although the ratio of unique values to the number of instances was relatively high, a significant overlap between both classes was observed.
##################################
# Visualizing the imbalanced data set
##################################
ggplot(PMA_PreModelling_Train, aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "Without Oversampling - Synthetic Minority Oversampling Technique") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

##################################
# Implementing OS_SMOTE
# Visualizing the oversampled data using OS_SMOTE
##################################
recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_smote(Class, seed=123456789) %>%
prep() %>%
bake(new_data = NULL) %>%
ggplot(aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "With Oversampling - Synthetic Minority Oversampling Technique") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

OS_SMOTE <- recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_smote(Class, seed=123456789) %>%
prep()
PMA_PreModelling_Train_LR_OS_SMOTE <- OS_SMOTE %>%
bake(new_data = NULL)
(PMA_PreModelling_Train_LR_OS_SMOTE <- as.data.frame(PMA_PreModelling_Train_LR_OS_SMOTE))
## V1 V11 Class
## 1 0.914240973 -1.558615871 M
## 2 2.175018299 0.345206089 M
## 3 -0.230553865 -0.166907171 M
## 4 1.231717734 1.858421213 M
## 5 0.424204681 1.891540835 M
## 6 1.143139503 0.556904232 M
## 7 -0.507041623 0.389850825 M
## 8 0.312328639 0.988089406 M
## 9 -0.935159101 0.036862390 M
## 10 0.408807916 -1.630790485 M
## 11 0.738227771 -1.157702863 M
## 12 1.169078303 -0.792461442 M
## 13 0.118911997 -1.309827049 M
## 14 -0.174409184 -1.318776567 M
## 15 1.012209983 -0.763384712 M
## 16 0.813818599 0.266052702 M
## 17 0.207991857 -0.391049992 M
## 18 -0.957451233 0.025737464 M
## 19 0.695599819 -0.118279862 M
## 20 -1.204708453 -0.186398477 M
## 21 -0.068999246 0.131492907 M
## 22 0.527094931 -0.396504803 M
## 23 0.104263091 0.332865171 M
## 24 0.465616266 -0.564972117 M
## 25 -0.507041623 -0.241278874 M
## 26 0.043933590 0.307319548 M
## 27 0.147716923 -0.201736917 M
## 28 0.612463710 0.844019251 M
## 29 -0.068999246 0.639306087 M
## 30 1.395977603 1.550850706 M
## 31 -0.180527407 1.028124858 M
## 32 0.565359423 0.740125313 M
## 33 2.230423635 1.213291690 M
## 34 0.782380194 0.937156911 M
## 35 2.005116912 0.950721679 M
## 36 1.784287809 0.743374785 M
## 37 1.523834205 1.622070841 M
## 38 1.928176420 1.777321144 M
## 39 -1.204708453 0.652640830 M
## 40 1.934080462 1.478976095 M
## 41 0.744203377 2.707733238 M
## 42 1.424344244 -0.633198890 M
## 43 -0.491314245 -0.582362640 M
## 44 0.689392711 -0.830303223 M
## 45 1.381555338 -0.092589575 M
## 46 1.002499594 0.071585037 M
## 47 0.977896818 0.150716010 M
## 48 0.278484686 -2.269456602 M
## 49 1.406688795 0.063725362 M
## 50 1.852613244 0.043994401 M
## 51 1.281675939 -0.272577947 M
## 52 1.390588252 0.333592312 M
## 53 -0.192852168 -0.577521460 M
## 54 -0.180527407 -0.624335667 M
## 55 -0.052243905 -0.171980296 M
## 56 -0.779287302 -0.129949769 M
## 57 -0.041194136 -1.176969454 M
## 58 -0.924156756 -1.249334984 M
## 59 -0.168320027 -1.801733345 M
## 60 -2.093160440 0.595633277 M
## 61 -0.230553865 -0.220586042 M
## 62 -1.038676203 0.485162490 M
## 63 -0.030238811 0.373354928 M
## 64 0.089444831 0.041618772 M
## 65 0.303951059 -0.217150498 M
## 66 -0.132380328 -0.090114281 M
## 67 -1.546463816 0.014574884 M
## 68 -0.499153812 0.873013033 M
## 69 -0.108968439 0.075507923 M
## 70 0.686277848 -1.023372673 M
## 71 -0.721885235 -1.351014760 M
## 72 -3.557061230 -0.281328435 M
## 73 -0.789109306 -0.363022432 M
## 74 -0.750264961 -0.039378188 M
## 75 -0.379134945 2.233656987 M
## 76 -0.371994173 0.601036378 M
## 77 0.401039618 0.451564218 M
## 78 -0.295964244 1.507548259 M
## 79 0.256869245 1.198277196 M
## 80 1.243886483 1.446942626 M
## 81 -0.230553865 0.362555066 M
## 82 -0.256318859 -0.638134800 M
## 83 0.632133128 0.467002438 M
## 84 0.324793230 1.735703263 M
## 85 0.723165726 0.693677522 M
## 86 -1.191144673 1.197119945 M
## 87 -1.177722925 1.077309728 M
## 88 0.142960831 2.422559073 M
## 89 0.443194144 1.747415902 M
## 90 -0.180527407 0.753754356 M
## 91 0.544629613 0.366158697 M
## 92 -1.274767709 0.414096217 M
## 93 -0.539087424 -0.075302304 M
## 94 -0.555415865 0.017768054 M
## 95 0.345301912 -0.272577947 M
## 96 -1.868589923 0.021755168 M
## 97 -0.217863016 -0.212004228 M
## 98 0.625610136 0.176685353 M
## 99 -0.799007156 -0.018335390 M
## 100 -2.013850697 0.053479912 M
## 101 0.537644499 -0.191503002 M
## 102 -0.013979414 0.155313323 M
## 103 -0.935159101 0.531542880 M
## 104 -0.779287302 0.730360068 M
## 105 0.424204681 0.543217319 M
## 106 0.157176488 1.026917595 M
## 107 -0.323092564 0.487948233 M
## 108 0.377448180 0.471900708 M
## 109 0.992715078 0.203967739 M
## 110 0.295518362 0.338678072 M
## 111 0.099342681 0.206985690 M
## 112 -2.902302684 -1.492828399 R
## 113 -0.588710342 -0.468334419 R
## 114 -0.597170869 -1.705421467 R
## 115 -0.289270938 -1.179388252 R
## 116 0.544629613 -1.617624095 R
## 117 0.064360561 0.156078935 R
## 118 -0.168320027 -1.672266879 R
## 119 -0.316256467 -0.262980859 R
## 120 -1.638716948 -1.515010380 R
## 121 0.274190661 0.017768054 R
## 122 -0.860055136 -2.219461886 R
## 123 -0.789109306 -0.845162176 R
## 124 -1.427539411 -1.354912772 R
## 125 0.779483548 -1.339358851 R
## 126 -0.400799099 -2.263382579 R
## 127 -0.302692686 -0.923992684 R
## 128 0.827857790 0.186562547 R
## 129 -0.829167731 0.422614069 R
## 130 -2.434915803 -1.572864509 R
## 131 -1.138280881 -0.927282397 R
## 132 -0.860055136 -1.284438518 R
## 133 -1.125398710 0.669904780 R
## 134 -0.186674974 -1.399544510 R
## 135 -0.008605128 -1.571435552 R
## 136 -1.348878827 -0.413843793 R
## 137 -1.599032158 -1.370950282 R
## 138 -2.657432813 -1.322745508 R
## 139 -1.790490183 -1.512346033 R
## 140 -0.810443562 -0.452440072 R
## 141 -0.538605201 -0.548157539 R
## 142 -0.660062049 -0.527920440 R
## 143 -0.982531013 -0.440104448 R
## 144 -0.584866894 -1.740381378 R
## 145 -0.455540713 -2.107842443 R
## 146 -0.515081007 -1.938667573 R
## 147 -0.418862256 -1.691636374 R
## 148 -0.239516596 -1.286153950 R
## 149 -0.200058557 -1.542931405 R
## 150 -0.280193092 -1.198868010 R
## 151 0.168622255 -1.586231937 R
## 152 0.570113701 -1.587429430 R
## 153 0.594021690 -1.559102196 R
## 154 0.043595498 0.162273052 R
## 155 0.729004935 0.182615719 R
## 156 -0.112595852 0.208864171 R
## 157 -0.265229694 -1.679758994 R
## 158 -0.273378159 -1.939394197 R
## 159 -0.321171380 -2.060916205 R
## 160 -0.133312506 -1.650165905 R
## 161 0.134694842 -0.048560153 R
## 162 -0.475824201 -0.459442307 R
## 163 0.218194825 -0.008857141 R
## 164 -1.947338832 -1.537435745 R
## 165 -1.603461320 -1.473605052 R
## 166 -1.761439442 -1.523927754 R
## 167 -0.074193522 -0.550968323 R
## 168 0.602335677 0.117808437 R
## 169 -0.315403865 -0.314371272 R
## 170 -0.641190054 -2.240392925 R
## 171 -0.860055136 -1.717141529 R
## 172 -0.418893061 -1.870481935 R
## 173 -0.634044436 -1.777523552 R
## 174 -0.796581793 -0.891429685 R
## 175 -0.827847489 -1.085017958 R
## 176 -0.655076888 -0.593129261 R
## 177 -0.846646467 -1.201415865 R
## 178 -1.003113777 -1.302204563 R
## 179 -1.477508813 -1.392795503 R
## 180 -1.024062139 -1.032758823 R
## 181 -1.564325317 -1.458612705 R
## 182 -0.046092087 -1.390787072 R
## 183 -0.066053607 -1.212799303 R
## 184 0.616583983 -1.532369401 R
## 185 -0.286655632 -1.973154324 R
## 186 -0.020863313 -1.593062643 R
## 187 -0.264751485 -2.023354050 R
## 188 -0.360121149 -0.914685611 R
## 189 -0.291775634 -1.131727665 R
## 190 -0.270452304 -1.056144682 R
## 191 -0.234639216 -1.202941057 R
## 192 -0.084662664 -0.235306944 R
## 193 -0.539978141 -0.445804920 R
## 194 -0.263127896 -0.317813628 R
## 195 -0.719767227 0.017260698 R
## 196 -0.729847094 0.289854799 R
## 197 -0.854171908 0.443487311 R
## 198 -2.209147026 -1.524018122 R
## 199 -2.549911790 -1.553172404 R
## 200 -2.408990100 -1.545196549 R
## 201 -0.955492262 -1.161926657 R
## 202 -0.882407262 -0.713601583 R
## 203 -1.281912481 -0.577108003 R
## 204 -0.852329589 -0.860030693 R
## 205 -0.879649125 -1.259285873 R
## 206 -0.820084244 -1.036950160 R
## 207 -0.817419038 -1.020447965 R
## 208 -1.332052173 -0.332244311 R
## 209 -0.357437358 -0.215502108 R
## 210 -0.321886669 0.322888990 R
## 211 -1.300027287 -0.176942239 R
## 212 -0.158828325 -1.426424914 R
## 213 -0.255897661 -1.251002529 R
## 214 -0.177300888 -1.538827013 R
## 215 -0.255677280 -1.116706900 R
## 216 -0.162403256 -1.606447414 R
## 217 -0.161098689 -1.424233330 R
## 218 -0.099380030 -1.483810427 R
## 219 -1.358596322 -0.530100664 R
## 220 -0.896995634 0.313447296 R
## 221 -1.278607918 -0.418880974 R
## 222 -1.404025395 -1.073598925 R
PMA_PreModelling_Train_LR_OS_SMOTE$Label <- rep("LR_OS_SMOTE",nrow(PMA_PreModelling_Train_LR_OS_SMOTE))
##################################
# Verifying the class distribution
# for the oversampled data using OS_SMOTE
##################################
table(PMA_PreModelling_Train_LR_OS_SMOTE$Class)
##
## M R
## 111 111
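The balance can also be confirmed on the proportion scale (a one-line sketch, not part of the original output):
##################################
# Sketch: class distribution expressed as proportions
##################################
prop.table(table(PMA_PreModelling_Train_LR_OS_SMOTE$Class)) # M and R both at 0.50, given the 111:111 split above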
##################################
# Formulating the structure of the
# Logistic Regression model
##################################
LR_OS_SMOTE_Model <- glm(Class ~ V1 + V11,
data = PMA_PreModelling_Train_LR_OS_SMOTE,
family = binomial)
##################################
# Consolidating the model results
##################################
summary(LR_OS_SMOTE_Model)
##
## Call:
## glm(formula = Class ~ V1 + V11, family = binomial, data = PMA_PreModelling_Train_LR_OS_SMOTE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.37841 -0.64478 0.05535 0.65629 2.12157
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0415 0.2324 -4.481 7.43e-06 ***
## V1 -0.9329 0.2481 -3.760 0.00017 ***
## V11 -1.7443 0.2506 -6.961 3.38e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 307.76 on 221 degrees of freedom
## Residual deviance: 189.34 on 219 degrees of freedom
## AIC: 195.34
##
## Number of Fisher Scoring iterations: 5
LR_OS_SMOTE_Model_Coef <- (as.data.frame(LR_OS_SMOTE_Model$coefficients))
LR_OS_SMOTE_Model_Coef$Coef <- rownames(LR_OS_SMOTE_Model_Coef)
LR_OS_SMOTE_Model_Coef$Model <- rep("LR_OS_SMOTE",nrow(LR_OS_SMOTE_Model_Coef))
colnames(LR_OS_SMOTE_Model_Coef) <- c("Estimates","Coefficients","Model")
print(LR_OS_SMOTE_Model_Coef, row.names=FALSE)
## Estimates Coefficients Model
## -1.0415029 (Intercept) LR_OS_SMOTE
## -0.9328753 V1 LR_OS_SMOTE
## -1.7442581 V11 LR_OS_SMOTE
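Because the estimates are on the log-odds scale, exponentiating them yields odds ratios, which are often easier to interpret (a minimal sketch, not part of the original output):
##################################
# Interpretation sketch: coefficients as odds ratios
##################################
exp(coef(LR_OS_SMOTE_Model))
# e.g., exp(-1.7442581) is approximately 0.175: each unit increase
# in V11 multiplies the odds of Class=R by roughly 0.175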
##################################
# Computing the model predictions
##################################
(LR_OS_SMOTE_Model_Probabilities <- predict(LR_OS_SMOTE_Model,
type = c("response")))
## 1 2 3 4 5 6
## 0.695138300 0.024779368 0.369282594 0.004354817 0.008692463 0.043969471
## 7 8 9 10 11 12
## 0.222960465 0.044944135 0.441910102 0.805596919 0.571789299 0.320853998
## 13 14 15 16 17 18
## 0.756252682 0.805571240 0.342037247 0.094083981 0.365066375 0.451846444
## 19 20 21 22 23 24
## 0.184808902 0.600480342 0.230322198 0.301195793 0.151952308 0.379797538
## 25 26 27 28 29 30
## 0.463155988 0.165407411 0.304190208 0.043728465 0.109852360 0.006375734
## 31 32 33 34 35 36
## 0.064984856 0.054172728 0.005280164 0.032108175 0.010248414 0.017939314
## 37 38 39 40 41 42
## 0.005004700 0.002624183 0.258068057 0.004383673 0.001564322 0.219981147
## 43 44 45 46 47 48
## 0.606499788 0.441184242 0.102585458 0.108944043 0.098265568 0.934454823
## 49 50 51 52 53 54
## 0.078354180 0.054862637 0.146577492 0.051143629 0.536373281 0.553768583
## 55 56 57 58 59 60
## 0.333413190 0.478049348 0.740748894 0.880774271 0.905356078 0.468097562
## 61 62 63 64 65 66
## 0.391344086 0.285203257 0.159156007 0.231913539 0.279632581 0.318466956
## 67 68 69 70 71 72
## 0.592843463 0.109231538 0.255107619 0.525804921 0.879578471 0.940894174
## 73 74 75 76 77 78
## 0.581234332 0.432191970 0.010111774 0.148952395 0.099457455 0.032453398
## 79 80 81 82 83 84
## 0.033206650 0.008786310 0.188648708 0.577051822 0.079747910 0.012468425
## 85 86 87 88 89 90
## 0.050879338 0.117284577 0.139201194 0.004494323 0.010955560 0.100848196
## 91 92 93 94 95 96
## 0.100810641 0.360177402 0.399570525 0.364853162 0.291485499 0.660093722
## 97 98 99 100 101 102
## 0.384977539 0.126385323 0.434344509 0.677845021 0.229873159 0.214271334
## 103 104 105 106 107 108
## 0.250440555 0.169600465 0.084343448 0.048368075 0.169214359 0.098258011
## 109 110 111 112 113 114
## 0.089207115 0.129213957 0.183140465 0.986209131 0.580441619 0.923457474
## 115 116 117 118 119 120
## 0.783385955 0.781081414 0.202008970 0.884155454 0.428550245 0.958104757
## 121 122 123 124 125 126
## 0.209443900 0.974222449 0.762931417 0.934228287 0.638191570 0.963747328
## 127 128 129 130 131 132
## 0.701102490 0.105344361 0.267931877 0.981538824 0.837234144 0.880924439
## 133 134 135 136 137 138
## 0.238643107 0.828325850 0.846513532 0.718833604 0.944876905 0.976904705
## 139 140 141 142 143 144
## 0.963269404 0.623331302 0.602783546 0.621302933 0.655370721 0.926883871
## 145 146 147 148 149 150
## 0.955215764 0.943775731 0.908868243 0.806171172 0.862522074 0.787683378
## 151 152 153 154 155 156
## 0.827497084 0.767730195 0.754697624 0.203393521 0.115056286 0.214034101
## 157 158 159 160 161 162
## 0.894339710 0.930624937 0.945467349 0.876660196 0.253041716 0.550766391
## 163 164 165 166 167 168
## 0.226251030 0.969433469 0.953676457 0.963024518 0.497185366 0.140771687
## 169 170 171 172 173 174
## 0.450431899 0.969659778 0.940249029 0.931621958 0.934038188 0.778421640
## 175 176 177 178 179 180
## 0.835253446 0.646610366 0.863408362 0.897122923 0.940823077 0.847511773
## 181 182 183 184 185 186
## 0.950828270 0.806493833 0.756862021 0.741953895 0.935087595 0.852792408
## 187 188 189 190 191 192
## 0.939045080 0.708868092 0.769362054 0.741347551 0.781704288 0.365380305
## 193 194 195 196 197 198
## 0.559671314 0.439870312 0.401274633 0.296039750 0.265373614 0.975340798
## 199 200 201 202 203 204
## 0.982814940 0.980179251 0.867214606 0.736213169 0.761511998 0.777944845
## 205 206 207 208 209 210
## 0.878213438 0.822334132 0.817719367 0.685821378 0.417713421 0.213421318
## 211 212 213 214 215 216
## 0.617722703 0.831278762 0.798876785 0.859120974 0.758566641 0.871250176
## 217 218 219 220 221 222
## 0.831039532 0.837447101 0.759607794 0.320505259 0.707218982 0.894816419
##################################
# Creating a classification index
# based on the model predictions
##################################
(LR_OS_SMOTE_Model_Indices <- predict(LR_OS_SMOTE_Model,
type = c("link")))
## 1 2 3 4 5 6
## 0.82425259 -3.67265237 -0.53529572 -5.43210828 -4.73656844 -3.07929429
## 7 8 9 10 11 12
## -1.24849676 -3.05634956 -0.23341359 1.42164977 0.28915520 -0.74984995
## 13 14 15 16 17 18
## 1.13224345 1.42148582 -0.65422868 -2.26475881 -0.55344128 -0.19321306
## 19 20 21 22 23 24
## -1.48410023 0.40746693 -1.20649280 -0.84161007 -1.71937016 -0.49040766
## 25 26 27 28 29 30
## -0.14764367 -1.61853210 -0.82742315 -3.08504263 -2.09225005 -5.04885991
## 31 32 33 34 35 36
## -2.66640847 -2.85988236 -5.23850398 -3.40600965 -4.57033103 -4.00265851
## 37 38 39 40 41 42
## -5.29236048 -5.94035796 -1.05603418 -5.42547492 -6.45873755 -1.26577624
## 43 44 45 46 47 48
## 0.43262277 -0.23635725 -2.16882171 -2.10157285 -2.21664637 2.65722365
## 49 50 51 52 53 54
## -2.46492169 -2.84649772 -1.76170050 -2.92061950 0.14575059 0.21590919
## 55 56 57 58 59 60
## -0.69278785 -0.08785908 1.04986458 1.99978279 2.25820667 -0.12778335
## 61 62 63 64 65 66
## -0.44166592 -0.91879612 -1.66452124 -1.19753768 -0.94628485 -0.76082602
## 67 68 69 70 71 72
## 0.37573267 -2.09861470 -1.07155426 0.10331148 1.98844445 2.76750117
## 73 74 75 76 77 78
## 0.32784250 -0.27291352 -4.58389159 -1.74284131 -2.20326744 -3.39495845
## 79 80 81 82 83 84
## -3.37123462 -4.72573535 -1.45881452 0.31068241 -2.44577712 -4.37200900
## 85 86 87 88 89 90
## -2.92607903 -2.01839960 -1.82194048 -5.40043585 -4.50289216 -2.18783550
## 91 92 93 94 95 96
## -2.18824973 -0.57459425 -0.40725491 -0.55436124 -0.88818027 0.66371190
## 97 98 99 100 101 102
## -0.46847380 -1.93330405 -0.26414720 0.74388595 -1.20902755 -1.29936840
## 103 104 105 106 107 108
## -1.09626404 -1.58846149 -2.38474422 -2.97933833 -1.59120550 -2.21673166
## 109 110 111 112 113 114
## -2.32335472 -1.90792669 -1.49521373 4.26986170 0.32458654 2.49027827
## 115 116 117 118 119 120
## 1.28550832 1.27197948 -1.37378525 2.03238373 -0.28776857 3.12978483
## 121 122 123 124 125 126
## -1.32828070 3.63213568 1.16881866 2.65353097 0.56752363 3.28031608
## 127 128 129 130 131 132
## 0.85255334 -2.13920427 -1.00514081 3.97345164 1.63779107 2.00121359
## 133 134 135 136 137 138
## -1.16013307 1.57380841 1.70751378 0.93868325 2.84148588 3.74476018
## 139 140 141 142 143 144
## 3.26672303 0.50371215 0.41707681 0.49508219 0.64273177 2.53977930
## 145 146 147 148 149 150
## 3.06008104 2.82054007 2.29989380 1.42532065 1.83639738 1.31101754
## 151 152 153 154 155 156
## 1.56799145 1.19553871 1.12382553 -1.36521818 -2.04010260 -1.30077805
## 157 158 159 160 161 162
## 2.13585655 2.59632886 2.85287973 1.96117628 -1.08245498 0.20376771
## 163 164 165 166 167 168
## -1.22960236 3.45680620 3.02467415 3.25982382 -0.01125866 -1.80889534
## 169 170 171 172 173 174
## -0.19892580 3.46447098 2.75595933 2.61187535 2.65044135 1.25649204
## 175 176 177 178 179 180
## 1.62332695 0.60417267 1.84389204 2.16565804 2.76622345 1.71521733
## 181 182 183 184 185 186
## 2.96201461 1.42738687 1.13555187 1.05614883 2.66760147 1.75667237
## 187 188 189 190 191 192
## 2.73472891 0.88989300 1.20471252 1.05298428 1.27562590 -0.55208717
## 193 194 195 196 197 198
## 0.23982821 -0.24168840 -0.40015694 -0.86622815 -1.01822335 3.67763681
## 199 200 201 202 203 204
## 4.04638045 3.90100613 1.87655223 1.02637839 1.16098692 1.25372983
## 205 206 207 208 209 210
## 1.97561964 1.53224216 1.50097177 0.78065553 -0.33216713 -1.30442453
## 211 212 213 214 215 216
## 0.47989340 1.59471732 1.37928899 1.80800819 1.14483716 1.91205799
## 217 218 219 220 221 222
## 1.59301260 1.63935462 1.15053046 -0.75145081 0.88191540 2.14091136
max(LR_OS_SMOTE_Model_Indices)
## [1] 4.269862
min(LR_OS_SMOTE_Model_Indices)
## [1] -6.458738
##################################
# Consolidating the model probabilities
# and classification index
# based on the model predictions
##################################
LR_OS_SMOTE_Model_Predictions <- as.data.frame(PMA_PreModelling_Train_LR_OS_SMOTE)
LR_OS_SMOTE_Model_Predictions$LR_OS_SMOTE_Prob <- LR_OS_SMOTE_Model_Probabilities
LR_OS_SMOTE_Model_Predictions$LR_OS_SMOTE_LP <- LR_OS_SMOTE_Model_Indices
LR_OS_SMOTE_Model_Predictions$Class <- as.factor(LR_OS_SMOTE_Model_Predictions$Class)
LR_OS_SMOTE_Model_Predictions$Label <- rep("LR_OS_SMOTE",nrow(LR_OS_SMOTE_Model_Predictions))
##################################
# Formulating the probability curve
# using the consolidated model predictions
##################################
LR_OS_SMOTE_Model_Predictions %>%
ggplot(aes(x = LR_OS_SMOTE_LP ,
y = LR_OS_SMOTE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
ggtitle("Estimated Rock Detection Probabilities Based on Classification Index : Logistic Regression (OS_SMOTE)") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

1.5.9 Oversampling - Random Oversampling Examples (LR_OS_ROSE)
Random Oversampling Examples generates synthetic data by enlarging the feature space of the minority and majority class instances. The algorithm draws new examples from a conditional kernel density estimate of the two classes. The kernel is a normal product function centered at each of the instances, with a diagonal covariance matrix whose width defines the neighborhood; this smoothing matrix is asymptotically optimal under the assumption of multivariate normality.
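In essence, each synthetic example is drawn from a normal kernel centered on a randomly selected instance of the chosen class. The sketch below mimics this for one new example; the seed instance and bandwidths are hypothetical, whereas the actual ROSE implementation estimates the smoothing matrix from the data:
##################################
# Illustrative sketch (not the step_rose implementation):
# one synthetic example drawn from a normal kernel
# centered on a randomly chosen seed instance
##################################
seed_instance <- c(V1=-0.86, V11=-2.22) # hypothetical seed case
h <- c(0.5, 0.5) # hypothetical kernel bandwidths
(synthetic <- rnorm(2, mean=seed_instance, sd=h))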
Logistic Regression models the relationship between the probability of an event (among two outcome levels) by having the log-odds of the event be a linear combination of a set of predictors weighted by their respective parameter estimates. The parameters are estimated via maximum likelihood, with the fitting algorithm iteratively testing candidate values to optimize the fit of the log-odds. These iterations evaluate the log-likelihood function, which logistic regression seeks to maximize in order to find the best parameter estimates. Given the optimal parameters, the conditional probability for each observation can be calculated, and the sum of their logarithms yields the log-likelihood of the fitted model.
[A] The class ratio of the original data was noted at
80:20.
[A.1] Majority Class =
Class=M with 111 instances
[A.2] Minority Class =
Class=R with 25 instances
[B] The class ratio of the oversampled data was noted at approximately 50:50 (ROSE draws the class of each synthetic example at random, so the counts are not exactly equal), with the majority of the added instances being unique values for the minority class.
[B.1] Majority Class =
Class=M with 110 instances
[B.2] Minority Class =
Class=R with 112 instances
[C] The logistic regression model from the
stats
package was implemented. The
Class
response was regressed against the
V1 and
V11 predictors.
[D] The logistic curve formulated by plotting the predicted probabilities against the classification index using the logit values showed a sufficiently balanced logistic profile for the predicted points from both the majority and minority classes. Although the ratio of unique values to the number of instances was relatively high, a significant overlap between both classes was observed.
##################################
# Visualizing the imbalanced data set
##################################
ggplot(PMA_PreModelling_Train, aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "Without Oversampling - Random Oversampling Examples") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

##################################
# Implementing OS_ROSE
# Visualizing the oversampled data using OS_ROSE
##################################
recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_rose(Class, seed=123456789) %>%
prep() %>%
bake(new_data = NULL) %>%
ggplot(aes(V1, V11, color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
labs(title = "With Oversampling - Random Oversampling Examples") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

OS_ROSE <- recipe(Class ~ V1 + V11, data = PMA_PreModelling_Train) %>%
step_rose(Class, seed=123456789) %>%
prep()
PMA_PreModelling_Train_LR_OS_ROSE <- OS_ROSE %>%
bake(new_data = NULL)
(PMA_PreModelling_Train_LR_OS_ROSE <- as.data.frame(PMA_PreModelling_Train_LR_OS_ROSE))
## V1 V11 Class
## 1 0.453829274 -0.420221366 M
## 2 0.127920224 0.686476420 M
## 3 1.075595629 -0.235554656 M
## 4 2.137901379 -1.178271820 M
## 5 1.092110905 0.456926231 M
## 6 0.857402441 1.398298653 M
## 7 -1.907459325 -0.021176979 M
## 8 0.309616027 -0.332458449 M
## 9 0.693229730 -2.341819879 M
## 10 1.716214396 -0.555582217 M
## 11 -0.770935156 -0.187998880 M
## 12 -1.496837087 1.378598435 M
## 13 -0.106000306 1.534889807 M
## 14 -0.916802330 -1.457363172 M
## 15 0.157063822 0.775286340 M
## 16 -0.064112224 -1.717694803 M
## 17 -0.112461451 -0.079932050 M
## 18 0.125906565 -1.196069844 M
## 19 -0.946156465 0.057997744 M
## 20 -1.748418397 -0.678614715 M
## 21 0.690260404 0.140155995 M
## 22 -0.560768697 0.384716767 M
## 23 0.250861459 -0.498669324 M
## 24 -2.438990535 0.731196382 M
## 25 1.529501791 -1.522171442 M
## 26 2.558447121 1.985131848 M
## 27 -0.387603764 0.364725766 M
## 28 -0.961430471 -0.770858863 M
## 29 0.863038442 2.196168690 M
## 30 1.039157671 0.133492560 M
## 31 -1.697870849 -1.143428379 M
## 32 -0.610125646 -0.127814520 M
## 33 -0.749263527 -0.150861865 M
## 34 -0.279227167 0.078775816 M
## 35 0.194985084 0.683020200 M
## 36 0.210531378 1.454231491 M
## 37 -0.147592914 0.362929787 M
## 38 -0.146821909 -1.873438319 M
## 39 -0.268162038 0.392433523 M
## 40 1.087154437 -1.567477864 M
## 41 -3.258143427 -0.310093040 M
## 42 0.221639048 2.228731592 M
## 43 -1.339735538 1.934676727 M
## 44 1.124251223 2.209600237 M
## 45 -1.920302036 0.658912503 M
## 46 0.320103472 0.002252489 M
## 47 -0.373684831 0.377203482 M
## 48 0.416501911 0.873626212 M
## 49 -0.337904242 0.283054835 M
## 50 -0.488958504 0.548829476 M
## 51 0.837780447 0.174524037 M
## 52 -0.818636727 1.462313516 M
## 53 0.522647205 0.301663855 M
## 54 0.077677433 -0.395192354 M
## 55 0.808685352 0.089522951 M
## 56 0.121157987 -1.487662966 M
## 57 0.930609853 -0.001536725 M
## 58 1.606666273 -0.460572860 M
## 59 1.879030954 1.415353475 M
## 60 -0.259700391 1.886523777 M
## 61 -1.163566178 -0.350578696 M
## 62 -0.473028462 -0.152114025 M
## 63 -0.270091603 0.103561905 M
## 64 -0.381874467 1.008824831 M
## 65 0.108606139 2.879619917 M
## 66 1.200948103 -0.411763531 M
## 67 -0.143328184 -1.493160633 M
## 68 1.381475831 0.613253958 M
## 69 1.396345117 0.826444551 M
## 70 0.096154862 -1.106932315 M
## 71 0.697201801 0.765645056 M
## 72 0.280450179 -0.779498730 M
## 73 -0.819993172 0.582228816 M
## 74 0.113508607 1.089077908 M
## 75 0.324148150 1.032218252 M
## 76 1.331741998 -0.388969403 M
## 77 -0.504012381 -0.624175476 M
## 78 -0.278104210 0.190564663 M
## 79 1.514692613 0.102141162 M
## 80 -1.332434679 0.332392981 M
## 81 0.748018715 2.546676685 M
## 82 -1.401265796 0.653245938 M
## 83 -3.555046462 -0.437014632 M
## 84 -0.797376264 0.590342435 M
## 85 -1.153805197 0.874627800 M
## 86 1.207193590 -0.900335700 M
## 87 -0.892535628 -0.053046654 M
## 88 0.025673058 -0.103210450 M
## 89 0.708863775 0.305835422 M
## 90 -0.479304016 -0.828521934 M
## 91 0.531855505 -1.020076195 M
## 92 0.843587712 -0.127072250 M
## 93 0.321447635 -0.242383581 M
## 94 -0.104688089 -0.955018789 M
## 95 -1.223682265 -0.993723911 M
## 96 -0.060035584 0.193030555 M
## 97 -0.184412189 -0.355696298 M
## 98 0.739698901 -0.008225890 M
## 99 1.751934661 0.060003736 M
## 100 0.012753639 -0.683233510 M
## 101 0.704272106 1.556907557 M
## 102 0.349253294 -1.364315460 M
## 103 -0.054249920 -0.279637524 M
## 104 0.422806210 1.856196052 M
## 105 -0.378916586 -1.774888727 M
## 106 0.896085827 -0.207757879 M
## 107 0.724946782 -1.974278177 M
## 108 1.177536942 1.427597354 M
## 109 0.417306877 0.674306556 M
## 110 0.119668249 0.770502124 M
## 111 -0.295729614 -1.403169748 R
## 112 -0.003104477 -1.303129281 R
## 113 -1.119225438 0.383400942 R
## 114 -1.205336093 -0.269591889 R
## 115 -0.031385116 -0.727154048 R
## 116 -0.262140197 -2.421432946 R
## 117 -0.004372316 -0.720426193 R
## 118 -0.652857082 -0.170061116 R
## 119 -0.243474578 0.765583966 R
## 120 -0.862558574 -2.394683708 R
## 121 -0.084987329 -0.315983611 R
## 122 0.335663272 -1.450942265 R
## 123 -0.547564677 -1.300644608 R
## 124 -0.085770445 -0.386014589 R
## 125 1.158708949 0.194640216 R
## 126 0.838758747 0.838624142 R
## 127 0.129363261 -1.283480849 R
## 128 1.185903335 0.067746005 R
## 129 -0.411610808 -1.398984345 R
## 130 -1.584645597 -3.066092331 R
## 131 -1.195552409 -0.055875719 R
## 132 -2.077490954 -1.016881169 R
## 133 1.123569804 -1.294911355 R
## 134 -1.696277599 1.577388230 R
## 135 -2.606714342 -2.430876734 R
## 136 -0.936576706 -0.967689409 R
## 137 -0.653422361 -0.737952271 R
## 138 1.399478608 -2.336802156 R
## 139 -0.439107621 -0.751356400 R
## 140 0.682169711 -0.645106739 R
## 141 -0.391226812 -1.228763737 R
## 142 -0.175404459 -1.171822064 R
## 143 0.149961923 -0.632053488 R
## 144 -1.306586950 0.989132759 R
## 145 0.192756837 -1.588485717 R
## 146 -1.637705898 -1.115797066 R
## 147 -1.035313367 -0.044642241 R
## 148 -0.454662737 0.327593121 R
## 149 -0.398721880 -2.531360086 R
## 150 -0.521649915 -1.203285944 R
## 151 -1.036925438 -1.419544783 R
## 152 -0.766892871 -1.525224785 R
## 153 -2.249201887 -1.166084343 R
## 154 -0.868992543 0.374276284 R
## 155 -3.122277505 -1.914384957 R
## 156 -1.711225949 0.095212802 R
## 157 0.846502796 -0.576583436 R
## 158 1.436542798 -0.686131512 R
## 159 -0.842933030 -0.712232667 R
## 160 -3.122732053 -1.680571529 R
## 161 0.635603354 -1.077316478 R
## 162 -0.654996459 -2.908798364 R
## 163 0.418427304 -0.144379560 R
## 164 -1.041169997 -1.922001515 R
## 165 -0.441540025 -0.259070590 R
## 166 -1.655681734 0.576523808 R
## 167 0.953177477 -1.711512314 R
## 168 -3.210760176 -1.284814307 R
## 169 -1.217750244 -1.591163354 R
## 170 0.259282153 -0.574268729 R
## 171 -0.921207702 -0.875473417 R
## 172 -0.865572661 -2.663360159 R
## 173 -0.947803456 -0.948766125 R
## 174 -0.460461844 0.286061177 R
## 175 0.419542632 -1.381617010 R
## 176 -0.621694247 -1.065107860 R
## 177 0.222233446 -0.308422959 R
## 178 -0.643581074 0.498325789 R
## 179 -0.260344829 0.290918714 R
## 180 -1.089617028 1.086009599 R
## 181 -0.474076557 -1.474554870 R
## 182 -1.113328689 -0.745280710 R
## 183 -1.367306542 -1.446652698 R
## 184 -0.227162341 -1.792986420 R
## 185 0.869250328 0.385747269 R
## 186 -0.828656357 1.245036664 R
## 187 -0.712628500 -2.041821701 R
## 188 -0.314242395 -0.861387324 R
## 189 -1.931922510 -1.283036428 R
## 190 -1.152889515 0.238829964 R
## 191 -1.757284170 -0.475428583 R
## 192 0.091169879 -0.475918031 R
## 193 0.853052045 -0.195251645 R
## 194 -1.030858895 -1.444796537 R
## 195 -0.458061630 -2.392889498 R
## 196 -0.757335077 -0.976940053 R
## 197 -1.825122741 -1.449896879 R
## 198 0.548521112 -1.625042438 R
## 199 0.815956664 0.127893714 R
## 200 -0.802403884 -1.389200876 R
## 201 -0.504679908 0.215394678 R
## 202 -0.828050004 0.276042407 R
## 203 -0.015700504 -1.362342936 R
## 204 -1.790699584 1.276762042 R
## 205 -0.319616516 -0.271949278 R
## 206 -0.497131472 -0.653931589 R
## 207 -0.434923007 -1.431687637 R
## 208 -2.126603097 -1.447703954 R
## 209 -1.508349941 -1.887116236 R
## 210 -2.644025006 -1.683878336 R
## 211 -1.292563792 0.557925461 R
## 212 0.956404116 0.448577768 R
## 213 0.851149954 -0.762481606 R
## 214 -0.031079759 -1.607449284 R
## 215 -0.131473949 -1.989738433 R
## 216 -0.987534049 -1.880077457 R
## 217 1.350301010 -0.132513674 R
## 218 -1.488226564 -1.509045712 R
## 219 1.732090761 -1.215001202 R
## 220 -1.010449867 -0.143822342 R
## 221 -0.443012681 -2.279874936 R
## 222 -0.918668643 -1.722236931 R
PMA_PreModelling_Train_LR_OS_ROSE$Label <- rep("LR_OS_ROSE",nrow(PMA_PreModelling_Train_LR_OS_ROSE))
##################################
# Verifying the class distribution
# for the oversampled data using OS_ROSE
##################################
table(PMA_PreModelling_Train_LR_OS_ROSE$Class)
##
## M R
## 110 112
##################################
# Formulating the structure of the
# Logistic Regression model
##################################
LR_OS_ROSE_Model <- glm(Class ~ V1 + V11,
data = PMA_PreModelling_Train_LR_OS_ROSE,
family = binomial)
##################################
# Consolidating the model results
##################################
summary(LR_OS_ROSE_Model)
##
## Call:
## glm(formula = Class ~ V1 + V11, family = binomial, data = PMA_PreModelling_Train_LR_OS_ROSE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9858 -0.9400 0.2881 0.9195 1.9172
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4555 0.1701 -2.678 0.007398 **
## V1 -0.5284 0.1593 -3.317 0.000908 ***
## V11 -0.9132 0.1650 -5.535 3.11e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 307.74 on 221 degrees of freedom
## Residual deviance: 249.98 on 219 degrees of freedom
## AIC: 255.98
##
## Number of Fisher Scoring iterations: 4
LR_OS_ROSE_Model_Coef <- (as.data.frame(LR_OS_ROSE_Model$coefficients))
LR_OS_ROSE_Model_Coef$Coef <- rownames(LR_OS_ROSE_Model_Coef)
LR_OS_ROSE_Model_Coef$Model <- rep("LR_OS_ROSE",nrow(LR_OS_ROSE_Model_Coef))
colnames(LR_OS_ROSE_Model_Coef) <- c("Estimates","Coefficients","Model")
print(LR_OS_ROSE_Model_Coef, row.names=FALSE)
## Estimates Coefficients Model
## -0.4555392 (Intercept) LR_OS_ROSE
## -0.5283578 V1 LR_OS_ROSE
## -0.9131847 V11 LR_OS_ROSE
##################################
# Computing the model predictions
##################################
(LR_OS_ROSE_Model_Probabilities <- predict(LR_OS_ROSE_Model,
type = c("response")))
## 1 2 3 4 5 6 7
## 0.42272822 0.24048787 0.30816041 0.37539602 0.19002887 0.10106385 0.63913697
## 8 9 10 11 12 13 14
## 0.42176519 0.78863241 0.29839231 0.53082787 0.28422724 0.14170857 0.79571130
## 15 16 17 18 19 20 21
## 0.22330577 0.75894563 0.41991396 0.63880235 0.49785184 0.74799586 0.27924052
## 22 23 24 25 26 27 28
## 0.37505987 0.46687195 0.54125677 0.53154810 0.02608136 0.35805690 0.68056655
## 29 30 31 32 33 34 35
## 0.05131761 0.24480950 0.81543281 0.49588602 0.51951627 0.40614035 0.23464164
## 36 37 38 39 40 41 42
## 0.13070302 0.32982442 0.79130846 0.33800037 0.59903625 0.82478350 0.06863251
## 43 44 45 46 47 48 49
## 0.18028811 0.04447561 0.48934125 0.34825343 0.35375905 0.18642890 0.36923822
## 50 51 52 53 54 55 56
## 0.33217220 0.25777606 0.20450560 0.26753625 0.46612769 0.27596505 0.69824553
## 57 58 59 60 61 62 63
## 0.27972403 0.29238147 0.06060725 0.11495790 0.61760221 0.48333050 0.39953443
## 64 65 66 67 68 69 70
## 0.23594946 0.04138595 0.32870381 0.72784589 0.14861986 0.12477099 0.62351399
## 71 72 73 74 75 76 77
## 0.17900745 0.52700098 0.36494339 0.18093261 0.17230032 0.30917596 0.59405330
## 78 79 80 81 82 83 84
## 0.38163331 0.20601548 0.48623520 0.04006559 0.42269894 0.86079066 0.36046859
## 85 86 87 88 89 90 91
## 0.34420429 0.43261336 0.51611451 0.40736875 0.24799075 0.63513227 0.54858837
## 92 93 94 95 96 97 98
## 0.31319703 0.40034265 0.61582913 0.74997077 0.35432276 0.49167898 0.30177638
## 99 100 101 102 103 104 105
## 0.19216831 0.54032242 0.09540200 0.64698261 0.45722602 0.08517833 0.79664657
## 106 107 108 109 110 111 112
## 0.32316339 0.72398394 0.08460516 0.21554948 0.22751830 0.72751777 0.67614219
## 113 114 115 116 117 118 119
## 0.44662821 0.60527575 0.55603100 0.86923142 0.55098545 0.51117317 0.26385700
## 120 121 122 123 124 125 126
## 0.89907501 0.46951683 0.66643143 0.73526461 0.48557423 0.22348101 0.15915581
## 127 128 129 130 131 132 133
## 0.65659701 0.24158944 0.73874767 0.96013708 0.55654831 0.82788619 0.53327752
## 134 135 136 137 138 139 140
## 0.26899165 0.95857559 0.71565046 0.63728241 0.71889102 0.61362934 0.44352519
## 141 142 143 144 145 146 147
## 0.70542292 0.66978816 0.51060062 0.33884294 0.70954950 0.80669624 0.53301273
## 148 149 150 151 152 153 154
## 0.37414679 0.88762701 0.71481811 0.80037425 0.79289528 0.85786807 0.41625077
## 155 156 157 158 159 160 161
## 0.94989962 0.58944065 0.40702723 0.35709778 0.65480568 0.93871938 0.54795805
## 162 163 164 165 166 167 168
## 0.92735848 0.36707788 0.86409346 0.50358274 0.47322076 0.64651810 0.91790101
## 169 170 171 172 173 174 175
## 0.83765665 0.48297670 0.69649384 0.91937851 0.71333541 0.38379354 0.64209386
## 176 177 178 179 180 181 182
## 0.69963871 0.42768298 0.36110724 0.35809396 0.29493003 0.75795197 0.69280709
## 183 184 185 186 187 188 189
## 0.83032735 0.78613971 0.21975873 0.23963883 0.85638009 0.62177733 0.85028733
## 190 191 192 193 194 195 196
## 0.48388136 0.71240426 0.48272976 0.32564362 0.80352755 0.87778623 0.69777637
## 197 198 199 200 201 202 203
## 0.86209523 0.67669167 0.26826426 0.77504386 0.40478111 0.43288003 0.68929534
## 204 205 206 207 208 209 210
## 0.33730469 0.49041932 0.59971660 0.74680630 0.87975259 0.88742407 0.92266675
## 211 212 213 214 215 216 217
## 0.42994096 0.20253953 0.44794817 0.73667934 0.80704537 0.85607802 0.25961475
## 218 219 220 221 222
## 0.84668245 0.43507308 0.55222785 0.86535253 0.83237685
##################################
# Creating a classification index
# based on the model predictions
##################################
(LR_OS_ROSE_Model_Indices <- predict(LR_OS_ROSE_Model,
type = c("link")))
## 1 2 3 4 5 6
## -0.311583732 -1.150006656 -0.808733670 -0.509136303 -1.449822617 -2.185459486
## 7 8 9 10 11 12
## 0.571620329 -0.315531291 1.316701585 -0.854965324 0.123468107 -0.923588677
## 13 14 15 16 17 18
## -1.801171064 1.359702258 -1.246504767 1.146907641 -0.323126605 0.570169780
## 19 20 21 22 23 24
## -0.008592705 1.087951913 -0.948232019 -0.510570169 -0.132706621 0.165403137
## 25 26 27 28 29 30
## 0.126360262 -3.620106864 -0.583807739 0.756376634 -2.917040046 -1.126489672
## 31 32 33 34 35 36
## 1.485705461 -0.016456294 0.078104777 -0.379944235 -1.182284732 -1.894757113
## 37 38 39 40 41 42
## -0.708979289 1.332830551 -0.672218011 0.401451079 1.549098576 -2.607887604
## 43 44 45 46 47 48
## -1.514396714 -3.067319346 -0.042641454 -0.626725333 -0.602556377 -1.473383381
## 49 50 51 52 53 54
## -0.535486224 -0.698376867 -1.057559759 -1.358368477 -1.007158785 -0.135697076
## 55 56 57 58 59 60
## -0.964565444 0.838957115 -0.945830902 -0.883845811 -2.740819106 -2.041069195
## 61 62 63 64 65 66
## 0.479383185 -0.066702727 -0.407405360 -1.175016290 -3.142547062 -0.714053376
## 67 68 69 70 71 72
## 0.983720838 -1.745466933 -1.948005631 0.504490295 -1.523086620 0.108109072
## 73 74 75 76 77 78
## -0.553971878 -1.510041697 -1.569411378 -0.803974603 0.380747178 -0.482621426
## 79 80 81 82 83 84
## -1.349112661 -0.055073129 -3.176347023 -0.311703691 1.821872477 -0.573330931
## 85 86 87 88 89 90
## -0.644613971 -0.271196583 0.064480356 -0.374853574 -1.109357179 0.554298385
## 91 92 93 94 95 96
## 0.194968769 -0.785214949 -0.404037608 0.471882126 1.098456379 -0.600091505
## 97 98 99 100 101 102
## -0.033287169 -0.838853164 -1.435982099 0.161640704 -2.249391106 0.605802116
## 103 104 105 106 107 108
## -0.171515133 -2.373982082 1.365465607 -0.739271854 0.964310162 -2.381360181
## 109 110 111 112 113 114
## -1.291793024 -1.222377651 0.982065023 0.736098817 -0.214303591 0.427496730
## 115 116 117 118 119 120
## 0.225069325 1.894180197 0.204653127 0.044700141 -1.026017110 2.186988947
## 121 122 123 124 125 126
## -0.122084091 0.692088786 1.021499657 -0.057719106 -1.245494630 -1.664522726
## 127 128 129 130 131 132
## 0.648165803 -1.143985141 1.039469713 3.181629377 0.227165101 1.570719734
## 133 134 135 136 137 138
## 0.133307162 -0.999744528 3.141578211 0.922987602 0.563588342 0.938967358
## 139 140 141 142 143 144
## 0.462593918 -0.226867299 0.873256809 0.707227114 0.042408819 -0.668454716
## 145 146 147 148 149 150
## 0.893197099 1.428684346 0.132243309 -0.514467642 2.066727983 0.918900944
## 151 152 153 154 155 156
## 1.388635067 1.342466612 1.797694609 -0.338183600 2.942327636 0.361653519
## 157 158 159 160 161 162
## -0.376268405 -0.587983024 0.640231037 2.729052948 0.192423733 2.546803534
## 163 164 165 166 167 168
## -0.544773350 1.849713529 0.014331213 -0.107219562 0.603768915 2.414163843
## 169 170 171 172 173 174
## 1.640894726 -0.068119539 0.830657032 2.433932696 0.911638889 -0.473477302
## 175 176 177 178 179 180
## 0.584463706 0.845578033 -0.291310863 -0.570561627 -0.583646521 -0.871558923
## 181 182 183 184 185 186
## 1.141483829 0.813275667 1.587949042 1.301811601 -1.267072947 -1.154660623
## 187 188 189 190 191 192
## 1.785544023 0.497098959 1.736856426 -0.064496902 0.907089741 -0.069108460
## 193 194 195 196 197 198
## -0.727955121 1.408489278 1.971631378 0.836731432 1.832802349 0.738609301
## 199 200 201 202 203 204
## -1.003446893 1.237014177 -0.385582774 -0.270110234 0.796827032 -0.675328688
## 205 206 207 208 209 210
## -0.038327405 0.404284424 1.081651042 1.990089307 2.064695003 2.479144060
## 211 212 213 214 215 216
## -0.282092041 -1.370497185 -0.208964397 1.028780156 1.430924825 1.783090147
## 217 218 219 220 221 222
## -1.047971859 1.708814429 -0.261182377 0.209676238 1.860476975 1.602567012
max(LR_OS_ROSE_Model_Indices)
## [1] 3.181629
min(LR_OS_ROSE_Model_Indices)
## [1] -3.620107
##################################
# Consolidating the model probabilities
# and classification index
# based on the model predictions
##################################
LR_OS_ROSE_Model_Predictions <- as.data.frame(PMA_PreModelling_Train_LR_OS_ROSE)
LR_OS_ROSE_Model_Predictions$LR_OS_ROSE_Prob <- LR_OS_ROSE_Model_Probabilities
LR_OS_ROSE_Model_Predictions$LR_OS_ROSE_LP <- LR_OS_ROSE_Model_Indices
LR_OS_ROSE_Model_Predictions$Class <- as.factor(LR_OS_ROSE_Model_Predictions$Class)
LR_OS_ROSE_Model_Predictions$Label <- rep("LR_OS_ROSE",nrow(LR_OS_ROSE_Model_Predictions))
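As a quick sanity check, the classification index (logit) and the
estimated probability should be related through the logistic function,
p = 1 / (1 + exp(-index)). The sketch below assumes the probabilities
were obtained with predict(..., type="response") and the indices with
predict(..., type="link"); under those assumptions it should return TRUE.
##################################
# Verifying the relationship between
# the classification index (logit)
# and the estimated probability
##################################
# plogis(x) = 1/(1+exp(-x)) is the logistic CDF
all.equal(as.numeric(LR_OS_ROSE_Model_Predictions$LR_OS_ROSE_Prob),
          as.numeric(plogis(LR_OS_ROSE_Model_Predictions$LR_OS_ROSE_LP)))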
##################################
# Formulating the probability curve
# using the consolidated model predictions
##################################
LR_OS_ROSE_Model_Predictions %>%
ggplot(aes(x = LR_OS_ROSE_LP ,
y = LR_OS_ROSE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
ggtitle("Estimated Rock Detection Probabilities Based on Classification Index : Logistic Regression (OS_ROSE)") +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")

1.6 Evaluation Summary
Undersampling and oversampling algorithm comparison:
[A] The choice between an undersampling and an
oversampling method for rebalancing imbalanced data prior to model
training is predominantly driven by the quality and characteristics of
the data set at hand. For large data sets, undersampling methods may
still prove effective in achieving balanced class distributions while
retaining a sufficient number of instances and keeping the
computational complexity reasonable. For small- to medium-sized data
sets, oversampling methods may help maximize data augmentation while
avoiding sparsity in the resulting data. The effectiveness of a given
algorithm may also vary depending on how well the predictors
discriminate between the classes. A class-count comparison across the
resampled training sets is sketched after this summary.
[B] Using the logistic regression model, the algorithm
which demonstrated relatively better performance among the
undersampling methods evaluated is the following:
[B.1] LR_US_NEARMISS: Undersampling - Near Miss
Algorithm
[B.1.1] A sufficiently balanced logistic
profile was achieved for the predicted points from both the majority
and minority classes.
[B.1.2] Because this undersampling method was
applied to a relatively small original data set, the resulting
distribution of instances was relatively sparse.
[C] Using the logistic regression model, the algorithm
which demonstrated relatively better performance among the
oversampling methods evaluated is the following:
[C.1] LR_OS_BSMOTE: Oversampling - Borderline
Synthetic Minority Oversampling Technique
[C.1.1] A sufficiently balanced logistic
profile was achieved for the predicted points from both the majority
and minority classes.
[C.1.2] Minimal overlap between the two
classes was obtained, although a slight skew was still observed due to
a longer tail for the predicted points belonging to the majority class.
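As a minimal sketch of the point raised in [A], the class counts of the
original and resampled training sets can be tabulated side by side.
This sketch assumes the PMA_PreModelling_Train_* data frames created
earlier are still in scope and each contains a Class factor.
##################################
# Comparing class counts across the
# original and resampled training sets
# (illustrative sketch)
##################################
resampled_sets <- list(Original   = PMA_PreModelling_Train_LR,
                       DOWNSAMPLE = PMA_PreModelling_Train_LR_US_DOWNSAMPLE,
                       UPSAMPLE   = PMA_PreModelling_Train_LR_OS_UPSAMPLE,
                       NEARMISS   = PMA_PreModelling_Train_LR_US_NEARMISS,
                       TOMEK      = PMA_PreModelling_Train_LR_US_TOMEK,
                       ADASYN     = PMA_PreModelling_Train_LR_OS_ADASYN,
                       BSMOTE     = PMA_PreModelling_Train_LR_OS_BSMOTE,
                       SMOTE      = PMA_PreModelling_Train_LR_OS_SMOTE,
                       ROSE       = PMA_PreModelling_Train_LR_OS_ROSE)
# Returns a matrix with one row per class level
# and one column per resampling method
sapply(resampled_sets, function(d) table(d$Class))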
##################################
# Visualizing the imbalanced data set
# alongside the undersampled and oversampled data
##################################
LR_ClassDistribution <- PMA_PreModelling_Train_LR %>%
ggplot(aes(x = V1 ,
y = V11,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
theme_bw() +
facet_grid(. ~ Label) +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
LR_US_DOWNSAMPLE_ClassDistribution <- PMA_PreModelling_Train_LR_US_DOWNSAMPLE %>%
ggplot(aes(x = V1,
y = V11,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
theme_bw() +
facet_grid(. ~ Label) +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")
LR_OS_UPSAMPLE_ClassDistribution <- PMA_PreModelling_Train_LR_OS_UPSAMPLE %>%
ggplot(aes(x = V1,
y = V11,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
theme_bw() +
facet_grid(. ~ Label) +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")
LR_US_NEARMISS_ClassDistribution <- PMA_PreModelling_Train_LR_US_NEARMISS %>%
ggplot(aes(x = V1,
y = V11,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
theme_bw() +
facet_grid(. ~ Label) +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")
LR_US_TOMEK_ClassDistribution <- PMA_PreModelling_Train_LR_US_TOMEK %>%
ggplot(aes(x = V1,
y = V11,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
theme_bw() +
facet_grid(. ~ Label) +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")
LR_OS_ADASYN_ClassDistribution <- PMA_PreModelling_Train_LR_OS_ADASYN %>%
ggplot(aes(x = V1,
y = V11,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
theme_bw() +
facet_grid(. ~ Label) +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")
LR_OS_BSMOTE_ClassDistribution <- PMA_PreModelling_Train_LR_OS_BSMOTE %>%
ggplot(aes(x = V1,
y = V11,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
theme_bw() +
facet_grid(. ~ Label) +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")
LR_OS_SMOTE_ClassDistribution <- PMA_PreModelling_Train_LR_OS_SMOTE %>%
ggplot(aes(x = V1,
y = V11,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
theme_bw() +
facet_grid(. ~ Label) +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")
LR_OS_ROSE_ClassDistribution <- PMA_PreModelling_Train_LR_OS_ROSE %>%
ggplot(aes(x = V1,
y = V11,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
scale_x_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
scale_y_continuous( limits=c(-4,4), breaks=seq(-4,4,by=1)) +
theme_bw() +
facet_grid(. ~ Label) +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=12, face="bold"),
axis.title.y = element_text(color="black", size=12, face="bold"),
legend.position="top")
RDD_ClassDistribution <- ggarrange(LR_ClassDistribution,
LR_US_DOWNSAMPLE_ClassDistribution,
LR_OS_UPSAMPLE_ClassDistribution,
LR_US_NEARMISS_ClassDistribution,
LR_US_TOMEK_ClassDistribution,
LR_OS_ADASYN_ClassDistribution,
LR_OS_BSMOTE_ClassDistribution,
LR_OS_SMOTE_ClassDistribution,
LR_OS_ROSE_ClassDistribution,
ncol=3, nrow=3)
annotate_figure(RDD_ClassDistribution,
top = text_grob("Class Distribution",
color = "black",
face = "bold",
size = 14))
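The nine chunks above differ only in the input data frame. As a
behavior-preserving refactoring sketch (assuming the same training data
frames, each with V1, V11, Class, and Label columns), the panel
construction can be factored into a helper function:
##################################
# Helper for building one class-distribution
# panel (refactoring sketch, equivalent to
# the repeated chunks above)
##################################
plot_class_distribution <- function(data) {
  data %>%
    ggplot(aes(x = V1, y = V11, color = Class)) +
    scale_colour_manual(values=c("#1846BA55","#B8000055")) +
    geom_point(size=5) +
    scale_x_continuous(limits=c(-4,4), breaks=seq(-4,4,by=1)) +
    scale_y_continuous(limits=c(-4,4), breaks=seq(-4,4,by=1)) +
    theme_bw() +
    facet_grid(. ~ Label) +
    theme(axis.title.x = element_text(color="black", size=12, face="bold"),
          axis.title.y = element_text(color="black", size=12, face="bold"),
          legend.position="top")
}
# Rebuilding the arranged figure from a list of training sets
# (ggarrange and its plotlist argument are from the ggpubr package, as used above)
ClassDistributionPlots <- lapply(list(PMA_PreModelling_Train_LR,
                                      PMA_PreModelling_Train_LR_US_DOWNSAMPLE,
                                      PMA_PreModelling_Train_LR_OS_UPSAMPLE,
                                      PMA_PreModelling_Train_LR_US_NEARMISS,
                                      PMA_PreModelling_Train_LR_US_TOMEK,
                                      PMA_PreModelling_Train_LR_OS_ADASYN,
                                      PMA_PreModelling_Train_LR_OS_BSMOTE,
                                      PMA_PreModelling_Train_LR_OS_SMOTE,
                                      PMA_PreModelling_Train_LR_OS_ROSE),
                                 plot_class_distribution)
ggarrange(plotlist = ClassDistributionPlots, ncol=3, nrow=3)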

##################################
# Replotting the logistic curves
##################################
LR_LogisticCurvePlot <- LR_Model_Predictions %>%
ggplot(aes(x = LR_LP ,
y = LR_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
facet_grid(. ~ Label) +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
LR_US_DOWNSAMPLE_LogisticCurvePlot <- LR_US_DOWNSAMPLE_Model_Predictions %>%
ggplot(aes(x = LR_US_DOWNSAMPLE_LP ,
y = LR_US_DOWNSAMPLE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
facet_grid(. ~ Label) +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
LR_OS_UPSAMPLE_LogisticCurvePlot <- LR_OS_UPSAMPLE_Model_Predictions %>%
ggplot(aes(x = LR_OS_UPSAMPLE_LP ,
y = LR_OS_UPSAMPLE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
facet_grid(. ~ Label) +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
LR_US_NEARMISS_LogisticCurvePlot <- LR_US_NEARMISS_Model_Predictions %>%
ggplot(aes(x = LR_US_NEARMISS_LP ,
y = LR_US_NEARMISS_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
facet_grid(. ~ Label) +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
LR_US_TOMEK_LogisticCurvePlot <- LR_US_TOMEK_Model_Predictions %>%
ggplot(aes(x = LR_US_TOMEK_LP ,
y = LR_US_TOMEK_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
facet_grid(. ~ Label) +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
LR_OS_ADASYN_LogisticCurvePlot <- LR_OS_ADASYN_Model_Predictions %>%
ggplot(aes(x = LR_OS_ADASYN_LP ,
y = LR_OS_ADASYN_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
facet_grid(. ~ Label) +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
LR_OS_BSMOTE_LogisticCurvePlot <- LR_OS_BSMOTE_Model_Predictions %>%
ggplot(aes(x = LR_OS_BSMOTE_LP ,
y = LR_OS_BSMOTE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
facet_grid(. ~ Label) +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
LR_OS_SMOTE_LogisticCurvePlot <- LR_OS_SMOTE_Model_Predictions %>%
ggplot(aes(x = LR_OS_SMOTE_LP ,
y = LR_OS_SMOTE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
facet_grid(. ~ Label) +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
LR_OS_ROSE_LogisticCurvePlot <- LR_OS_ROSE_Model_Predictions %>%
ggplot(aes(x = LR_OS_ROSE_LP ,
y = LR_OS_ROSE_Prob,
color = Class)) +
scale_colour_manual(values=c("#1846BA55","#B8000055")) +
geom_point(size=5) +
geom_line(color="black") +
xlab("Sonar Object Classification Index (Logit Values)") +
ylab("Estimated Rock Detection Probability") +
labs(color = "Class") +
scale_x_continuous( limits=c(-10,5), breaks=seq(-10,5,by=1)) +
scale_y_continuous( limits=c(0,1), breaks=seq(0,1,by=0.1),labels = scales::percent) +
facet_grid(. ~ Label) +
theme_bw() +
theme(plot.title = element_text(color="black", size=14, face="bold", hjust=0.50),
axis.title.x = element_text(color="black", size=10, face="bold"),
axis.title.y = element_text(color="black", size=10, face="bold"),
legend.position="top")
RLR_LogisticCurvePlot <- ggarrange(LR_LogisticCurvePlot,
LR_US_DOWNSAMPLE_LogisticCurvePlot,
LR_OS_UPSAMPLE_LogisticCurvePlot,
LR_US_NEARMISS_LogisticCurvePlot,
LR_US_TOMEK_LogisticCurvePlot,
LR_OS_ADASYN_LogisticCurvePlot,
LR_OS_BSMOTE_LogisticCurvePlot,
LR_OS_SMOTE_LogisticCurvePlot,
LR_OS_ROSE_LogisticCurvePlot,
ncol=3, nrow=3)
annotate_figure(RLR_LogisticCurvePlot,
top = text_grob("Estimated Rock Detection Probabilities Based on Classification Index",
color = "black",
face = "bold",
size = 14))
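The same refactoring applies to the nine logistic-curve chunks. A
sketch using the .data pronoun (available in ggplot2 3.0 and later) to
parameterize the method-specific column names:
##################################
# Helper for building one logistic-curve
# panel (refactoring sketch; the linear
# predictor and probability column names
# are passed as strings)
##################################
plot_logistic_curve <- function(predictions, lp_col, prob_col) {
  predictions %>%
    ggplot(aes(x = .data[[lp_col]], y = .data[[prob_col]], color = Class)) +
    scale_colour_manual(values=c("#1846BA55","#B8000055")) +
    geom_point(size=5) +
    geom_line(color="black") +
    xlab("Sonar Object Classification Index (Logit Values)") +
    ylab("Estimated Rock Detection Probability") +
    scale_x_continuous(limits=c(-10,5), breaks=seq(-10,5,by=1)) +
    scale_y_continuous(limits=c(0,1), breaks=seq(0,1,by=0.1), labels=scales::percent) +
    facet_grid(. ~ Label) +
    theme_bw() +
    theme(legend.position="top")
}
# Example usage, equivalent to one of the chunks above
plot_logistic_curve(LR_OS_ROSE_Model_Predictions, "LR_OS_ROSE_LP", "LR_OS_ROSE_Prob")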
